In [16]:
import pandas as pd
data = {'name': ['Alice', 'Bob', 'Charlie', 'David'], 
       'age': [25, 30, 35, 40],
       'city': ['New York', 'Los Angeles', 'Chicago', 'Houston']}

pd.DataFrame(data) creates a DataFrame from the data dictionary. Each key in the dictionary becomes a column in the DataFrame, and the values associated with the keys become the data in the respective columns.

In [5]:
df1 = pd.DataFrame(data)
df1

Unnamed: 0,name,age,city
0,Alice,25,New York
1,Bob,30,Los Angeles
2,Charlie,35,Chicago
3,David,40,Houston


In [6]:
data = [['Alice', 25, 'N'], 
       ['Bob', 30, 'LA'], 
       ['Cher', 29, 'Ch'],
       ['Dave', 20, 'Phil']]
df1 = pd.DataFrame(data, columns=['Name', 'Age', 'City'])
df1

Unnamed: 0,Name,Age,City
0,Alice,25,N
1,Bob,30,LA
2,Cher,29,Ch
3,Dave,20,Phil


In [8]:
names = pd.Series(['Al', 'Jake', 'Bru'])
data = {'Name': names}
df = pd.DataFrame(data)
df

Unnamed: 0,Name
0,Al
1,Jake
2,Bru


In [10]:
c = pd.Series([10, 20, 30, 40])
c

0    10
1    20
2    30
3    40
dtype: int64


Custom Index: When you want to assign specific labels to the elements of your Series, especially if the default index (0, 1, 2, ...) is not meaningful in your context.
Alignment: When you need to align two or more Series based on their indexes for operations like addition or merging.

In [12]:
d = {'apple': 10, 'banana': 20, 'cantaloupe': 90}
s = pd.Series(d)
s

apple         10
banana        20
cantaloupe    90
dtype: int64

In [13]:
s = pd.Series(5, index=['a', 'b', 'c'])
s

a    5
b    5
c    5
dtype: int64

In [15]:
data = [10, 20, 30]
custom_index = ['a', 'b', 'c']
s = pd.Series(data, index=custom_index)
s

a    10
b    20
c    30
dtype: int64

In [45]:
data = {'name': ['Alice', 'Bob', 'Charlie', 'David'], 
       'age': [25, 30, 35, 40],
       'city': ['New York', 'Los Angeles', 'Chicago', 'Houston']}

In [36]:
df = pd.DataFrame(data)
df

Unnamed: 0,name,age,city
0,Alice,25,New York
1,Bob,30,Los Angeles
2,Charlie,35,Chicago
3,David,40,Houston


loc[]: This is label-based indexing, where you specify the row and column labels. It is inclusive of both the start and stop labels

iloc[]: This is integer-based indexing, where you specify the row and column positions. It is exclusive of the stop position.

loc[]: df.loc[row_label, column_label]
iloc[]: df.iloc[row_position, column_position

In [30]:
print(df.loc[0, :])

name       Alice
age           25
city    New York
Name: 0, dtype: object


In [31]:
print(df.iloc[0, 1])

25


In [32]:
print(df.loc[0])  
print(df.iloc[1]) 
print(df.iloc[0:3, -2:])

name       Alice
age           25
city    New York
Name: 0, dtype: object
name            Bob
age              30
city    Los Angeles
Name: 1, dtype: object
   age         city
0   25     New York
1   30  Los Angeles
2   35      Chicago


In [37]:
df['Gender'] = ['Fem', 'M', 'F', 'F']
df

Unnamed: 0,name,age,city,Gender
0,Alice,25,New York,Fem
1,Bob,30,Los Angeles,M
2,Charlie,35,Chicago,F
3,David,40,Houston,F


In [39]:
filtered_df = df[df['age']>30]
filtered_df

Unnamed: 0,name,age,city,Gender
2,Charlie,35,Chicago,F
3,David,40,Houston,F


In [40]:
df['siblings'] = [2, 3, 4, 5]
df

Unnamed: 0,name,age,city,Gender,siblings
0,Alice,25,New York,Fem,2
1,Bob,30,Los Angeles,M,3
2,Charlie,35,Chicago,F,4
3,David,40,Houston,F,5


In [41]:
df['Siblings and age'] = df['age'] + df['siblings']
df

Unnamed: 0,name,age,city,Gender,siblings,Siblings and age
0,Alice,25,New York,Fem,2,27
1,Bob,30,Los Angeles,M,3,33
2,Charlie,35,Chicago,F,4,39
3,David,40,Houston,F,5,45


In [44]:
df['bonus'] = [20, 30, 20, 30]
df[df['bonus'] + df['age'] > 60]

Unnamed: 0,name,age,city,Gender,siblings,Siblings and age,bonus
3,David,40,Houston,F,5,45,30


### Handling Missing Data

In [84]:
import numpy as np
data = {'name': ['Alice', 'Bob', np.nan, 'David'], 
       'age': [25, np.nan, 35, 40],
       'city': ['New York', 'Los Angeles', 'Chicago', 'Houston']}

In [85]:
df = pd.DataFrame(data)

In [58]:
print(df.isnull())

    name    age   city
0  False  False  False
1  False   True  False
2   True  False  False
3  False  False  False


In [59]:
cleaned_df = df.dropna()
cleaned_df

Unnamed: 0,name,age,city
0,Alice,25.0,New York
3,David,40.0,Houston


In [68]:
d = df.isnull()
d

Unnamed: 0,name,age,city
0,False,False,False
1,False,False,False
2,True,False,False
3,False,False,False


In [67]:
df['age'] = np.mean(df['age'])
df

Unnamed: 0,name,age,city
0,Alice,33.333333,New York
1,Bob,33.333333,Los Angeles
2,,33.333333,Chicago
3,David,33.333333,Houston


In [81]:
mean_age = df['age'].mean()

In [82]:
df['age'] = df['age'].fillna(mean_age)
df

Unnamed: 0,name,age,city
0,Alice,25.0,New York
1,Bob,33.333333,Los Angeles
2,,35.0,Chicago
3,David,40.0,Houston


Fill Missing Values with Forward or Backward Values:

Use the fillna() function with method='ffill' (forward fill) or method='bfill' (backward fill) to fill missing values with the previous or next value.

In [88]:
df.fillna(method='ffill', inplace=True)
df

  df.fillna(method='ffill', inplace=True)


Unnamed: 0,name,age,city
0,Alice,25.0,New York
1,Bob,30.0,Los Angeles
2,Bob,35.0,Chicago
3,David,40.0,Houston


Filling missing values with a specific value

In [89]:
df.fillna(value={'age': 30, 'City': 'Philly'}, inplace=True)
df

Unnamed: 0,name,age,city
0,Alice,25.0,New York
1,Bob,30.0,Los Angeles
2,Bob,35.0,Chicago
3,David,40.0,Houston


### Grouping and aggregating data:

In [94]:
data = {'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Frank'],
        'age': [25, 30, 35, 40, 45, 50],
        'city': ['New York', 'Los Angeles', 'Chicago', 'New York', 'Chicago', 'Los Angeles']}

In [102]:
df = pd.DataFrame(data)
df

Unnamed: 0,name,age,city
0,Alice,25,New York
1,Bob,30,Los Angeles
2,Charlie,35,Chicago
3,David,40,New York
4,Eve,45,Chicago
5,Frank,50,Los Angeles


Grouping data based on one or more columns and performing aggregate functions on the grouped data.

In [100]:
average_age = df.groupby('city')['age'].mean()

In [101]:
average_age

city
Chicago        40.0
Los Angeles    40.0
New York       32.5
Name: age, dtype: float64

In [104]:
total_people = df.groupby('city')['name'].count()
total_people

city
Chicago        2
Los Angeles    2
New York       2
Name: name, dtype: int64

In [107]:
max_min = df.groupby('city')['age'].apply(lambda x: x.max() - x.min())
max_min

city
Chicago        10
Los Angeles    20
New York       15
Name: age, dtype: int64

## Normalize ages within each city:

In [109]:
def normalize_age(group):
    mean_age = group['age'].mean()
    std_age = group['age'].std()
    return (group['age'] - mean_age) / std_age

normalized_ages = df.groupby('city').apply(normalize_age)
normalized_ages

city          
Chicago      2   -0.707107
             4    0.707107
Los Angeles  1   -0.707107
             5    0.707107
New York     0   -0.707107
             3    0.707107
Name: age, dtype: float64

In [127]:
l1 = [5, 2, 1, 9, 8, 0, 4] 
l2 = [1, 10, 8, 3,9]

In [128]:
Output : [0, 0, 1, 1, 1, 0, 0]

In [131]:
def if_exists(l1, l2):
    l1 = np.array(l1)
    l2 = np.array(l2)
    return np.isin(l1, l2)

In [132]:
result = if_exists(l1, l2)
result

array([False, False,  True,  True,  True, False, False])

In [134]:
l1 = np.array(l1)
l2 = np.array(l2)