# Pandas

In [2]:
import pandas as pd
import numpy as np

In [16]:
# Sample roster used by the examples that follow: one row per person.
df = pd.DataFrame(
    data=dict(
        Name=['Alice', 'Bob', 'Charlie', 'David'],
        Age=[24, 27, 22, 32],
        Pet=['Cat', 'Dog', 'Fish', 'Bird'],
    )
)

In [4]:
df

Unnamed: 0,Name,Age,Pet
0,Alice,24,Cat
1,Bob,27,Dog
2,Charlie,22,Fish
3,David,32,Bird


# Inplace vs Copy

In [5]:
# Without inplace=True, set_index returns a new DataFrame; df itself is left untouched.
df.set_index('Name')

Unnamed: 0_level_0,Age,Pet
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Alice,24,Cat
Bob,27,Dog
Charlie,22,Fish
David,32,Bird


In [6]:
# df still has its default integer index and a 'Name' column: the call above did not modify it.
df

Unnamed: 0,Name,Age,Pet
0,Alice,24,Cat
1,Bob,27,Dog
2,Charlie,22,Fish
3,David,32,Bird


In [7]:
# inplace=True modifies df itself: from here on 'Name' is the index rather than a column.
df.set_index('Name', inplace=True)

In [8]:
# df is now indexed by Name; only Age and Pet remain as columns.
df

Unnamed: 0_level_0,Age,Pet
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Alice,24,Cat
Bob,27,Dog
Charlie,22,Fish
David,32,Bird


## iloc

In [17]:
# Start this section from a fresh, default-indexed frame. An earlier cell ran
# set_index('Name', inplace=True); without rebuilding, 'Name' would be the index
# here and df.iloc[0, 2] below would be out of bounds (only Age and Pet remain).
df = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Age': [24, 27, 22, 32],
    'Pet': ['Cat', 'Dog', 'Fish', 'Bird']
})
df

Unnamed: 0,Name,Age,Pet
0,Alice,24,Cat
1,Bob,27,Dog
2,Charlie,22,Fish
3,David,32,Bird


In [18]:
# First row by position, returned as a Series whose index is the column names.
df.iloc[0]

Name    Alice
Age        24
Pet       Cat
Name: 0, dtype: object

In [19]:
# Single value at row position 0, column position 2 (the 'Pet' column).
df.iloc[0,2]

'Cat'

In [20]:
# Rows at positions 0 and 1; the stop position (2) is excluded.
df.iloc[0:2]

Unnamed: 0,Name,Age,Pet
0,Alice,24,Cat
1,Bob,27,Dog


## loc

In [32]:
# Setting 'Name' as the index.
# Assign the result instead of using inplace=True, and skip the call when 'Name'
# is already the index so that re-running this cell does not raise KeyError.
if df.index.name != 'Name':
    df = df.set_index('Name')

In [33]:
df

Unnamed: 0_level_0,Age,Pet
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Alice,24,Cat
Bob,27,Dog
Charlie,22,Fish
David,32,Bird


In [35]:
# Label-based lookup: the row whose index label is 'Alice', returned as a Series.
df.loc['Alice']

Age     24
Pet    Cat
Name: Alice, dtype: object

## Advanced Indexing

In [24]:
# Fresh copy of the sample roster with the default integer index.
df = pd.DataFrame(
    {
        'Name': ['Alice', 'Bob', 'Charlie', 'David'],
        'Age': [24, 27, 22, 32],
        'Pet': ['Cat', 'Dog', 'Fish', 'Bird'],
    }
)

In [31]:
df

Unnamed: 0,Name,Age,Pet
0,Alice,24,Cat
1,Bob,27,Dog
2,Charlie,22,Fish
3,David,32,Bird


In [26]:
# NumPy boolean mask, one flag per row: True for rows 0 and 2, False for rows 1 and 3
boolean_array = np.array([1, 0, 1, 0], dtype=bool)

In [27]:
# A NumPy boolean array works as a positional mask with iloc: rows where it is True are kept.
df.iloc[boolean_array]

Unnamed: 0,Name,Age,Pet
0,Alice,24,Cat
2,Charlie,22,Fish


In [28]:
# iloc is purely positional and rejects a boolean Series as a mask (a plain boolean
# array, as in the previous cell, is accepted). The failure is the point of this
# cell, so catch and show it instead of letting it stop a Run All.
try:
    df.iloc[df['Age'] > 25]
except (NotImplementedError, ValueError) as err:
    print(f"{type(err).__name__}: {err}")

NotImplementedError: iLocation based boolean indexing on an integer type is not available

In [29]:
# Unlike iloc, loc accepts the boolean Series directly as a row mask.
df.loc[df['Age'] > 25]

Unnamed: 0,Name,Age,Pet
1,Bob,27,Dog
3,David,32,Bird


In [30]:
# isin builds the mask: True for rows whose Pet is one of the listed values.
df.loc[df['Pet'].isin(['Cat', 'Dog'])]

Unnamed: 0,Name,Age,Pet
0,Alice,24,Cat
1,Bob,27,Dog


## apply

In [36]:
# Rebuild the sample roster so this section starts from a known state, then display it.
df = pd.DataFrame.from_dict(
    {
        'Name': ['Alice', 'Bob', 'Charlie', 'David'],
        'Age': [24, 27, 22, 32],
        'Pet': ['Cat', 'Dog', 'Fish', 'Bird'],
    }
)
df

Unnamed: 0,Name,Age,Pet
0,Alice,24,Cat
1,Bob,27,Dog
2,Charlie,22,Fish
3,David,32,Bird


In [37]:
# apply calls the lambda once per value in 'Age'; the results become the new 'Age_squared' column.
df['Age_squared'] = df['Age'].apply(lambda x: x ** 2)
df

Unnamed: 0,Name,Age,Pet,Age_squared
0,Alice,24,Cat,576
1,Bob,27,Dog,729
2,Charlie,22,Fish,484
3,David,32,Bird,1024


In [38]:
def square_age(age):
    """Return ``age`` raised to the power of two."""
    return pow(age, 2)

In [39]:
# Same computation as the lambda version, passing the named function square_age to apply instead.
df['Age_squared_2'] = df['Age'].apply(square_age)
df

Unnamed: 0,Name,Age,Pet,Age_squared,Age_squared_2
0,Alice,24,Cat,576,576
1,Bob,27,Dog,729,729
2,Charlie,22,Fish,484,484
3,David,32,Bird,1024,1024


## a tricky question

In [40]:
# Two 3-row frames sharing columns a-c; only df2 has a column d.
df1 = pd.DataFrame(np.arange(9).reshape(3, 3), columns=list("abc"))
df2 = pd.DataFrame(np.arange(12).reshape(3, 4), columns=list("abcd"))

In [41]:
df1

Unnamed: 0,a,b,c
0,0,1,2
1,3,4,5
2,6,7,8


In [42]:
df2

Unnamed: 0,a,b,c,d
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11


In [43]:
# Addition aligns on column labels; 'd' exists only in df2, so the result is NaN in that column.
df1 + df2

Unnamed: 0,a,b,c,d
0,0,2,4,
1,7,9,11,
2,14,16,18,


In [44]:
# df1 is unchanged: the addition above produced a new DataFrame.
df1

Unnamed: 0,a,b,c
0,0,1,2
1,3,4,5
2,6,7,8


# Data Cleaning

## Missing Data

In [45]:
# Build a small frame with gaps in Age, Gender and Income
data = dict(
    Name=['Alice', 'Bob', 'Catherine', 'David', 'Eve', 'Frank'],
    Age=[34, None, 28, 45, 23, 32],
    Gender=['Female', 'Male', 'Female', None, 'Female', None],
    Income=[50000, 45000, None, 60000, 35000, 40000],
)
df = pd.DataFrame(data)

# Persist to CSV without writing the index as a column
df.to_csv('missing_data.csv', index=False)

In [46]:
# Read the file back: empty fields come in as NaN, and Age/Income are read as floats.
df = pd.read_csv('missing_data.csv')
df

Unnamed: 0,Name,Age,Gender,Income
0,Alice,34.0,Female,50000.0
1,Bob,,Male,45000.0
2,Catherine,28.0,Female,
3,David,45.0,,60000.0
4,Eve,23.0,Female,35000.0
5,Frank,32.0,,40000.0


In [47]:
# Drop every row that has at least one missing value; df itself is not modified.
df_dropna = df.dropna()
df_dropna

Unnamed: 0,Name,Age,Gender,Income
0,Alice,34.0,Female,50000.0
4,Eve,23.0,Female,35000.0


In [48]:
# Replace every missing value with 0, including the gaps in the text column Gender.
df_fill_zero = df.fillna(0)
df_fill_zero

Unnamed: 0,Name,Age,Gender,Income
0,Alice,34.0,Female,50000.0
1,Bob,0.0,Male,45000.0
2,Catherine,28.0,Female,0.0
3,David,45.0,0,60000.0
4,Eve,23.0,Female,35000.0
5,Frank,32.0,0,40000.0


In [49]:
# Forward fill: each missing value takes the last valid value above it in the same column.
# DataFrame.ffill() replaces fillna(method='ffill'), which is deprecated since pandas 2.1.
df_forward_fill = df.ffill()
df_forward_fill

Unnamed: 0,Name,Age,Gender,Income
0,Alice,34.0,Female,50000.0
1,Bob,34.0,Male,45000.0
2,Catherine,28.0,Female,45000.0
3,David,45.0,Female,60000.0
4,Eve,23.0,Female,35000.0
5,Frank,32.0,Female,40000.0


In [50]:
# Fill gaps in the numeric columns with that column's mean; Gender has no mean, so its gaps remain.
df_fill_mean = df.fillna(df.mean(numeric_only=True))
df_fill_mean

Unnamed: 0,Name,Age,Gender,Income
0,Alice,34.0,Female,50000.0
1,Bob,32.4,Male,45000.0
2,Catherine,28.0,Female,46000.0
3,David,45.0,,60000.0
4,Eve,23.0,Female,35000.0
5,Frank,32.0,,40000.0


## Inconsistent Data (extra feature)

In [51]:
# Adding inconsistent data to the DataFrame
data = {'Name': ['Alice', 'Bob', 'Catherine', 'David', 'Eve', 'Frank', 'Grace'],
        'Age': [34, None, 28, 45, 23, 32, 40],
        'Gender': ['Female', 'Male', 'Female', None, 'Female', None, 'Male'],
        'Income': [50000, 45000, None, 60000, 35000, 40000, 42000]}
df = pd.DataFrame(data)
# Assigning to a column that does not exist yet creates it; only Grace's row gets a
# value, every other row is left missing in 'ExtraColumn'.
df.loc[df['Name'] == 'Grace', 'ExtraColumn'] = 'ExtraColumnValue'

# Save to CSV
df.to_csv('inconsistent_data.csv', index=False)

In [52]:
df = pd.read_csv('inconsistent_data.csv')
df

Unnamed: 0,Name,Age,Gender,Income,ExtraColumn
0,Alice,34.0,Female,50000.0,
1,Bob,,Male,45000.0,
2,Catherine,28.0,Female,,
3,David,45.0,,60000.0,
4,Eve,23.0,Female,35000.0,
5,Frank,32.0,,40000.0,
6,Grace,40.0,Male,42000.0,ExtraColumnValue


In [53]:
# Keep only the rows that have a value in ExtraColumn. With a single-column subset,
# how='all' and how='any' select the same rows.
df_drop = df.dropna(axis=0, how='all', subset=['ExtraColumn'])
df_drop

Unnamed: 0,Name,Age,Gender,Income,ExtraColumn
6,Grace,40.0,Male,42000.0,ExtraColumnValue


In [54]:
# Select only the four expected columns, leaving ExtraColumn out.
df_standardize = df[['Name', 'Age', 'Gender', 'Income']]
df_standardize

Unnamed: 0,Name,Age,Gender,Income
0,Alice,34.0,Female,50000.0
1,Bob,,Male,45000.0
2,Catherine,28.0,Female,
3,David,45.0,,60000.0
4,Eve,23.0,Female,35000.0
5,Frank,32.0,,40000.0
6,Grace,40.0,Male,42000.0


In [55]:
# Assign the filled column back rather than calling fillna(inplace=True) on
# df['ExtraColumn']: the in-place form acts on an intermediate Series, triggers a
# chained-assignment FutureWarning in pandas 2.x and will not update df under Copy-on-Write.
df['ExtraColumn'] = df['ExtraColumn'].fillna('DefaultValue')
df

Unnamed: 0,Name,Age,Gender,Income,ExtraColumn
0,Alice,34.0,Female,50000.0,DefaultValue
1,Bob,,Male,45000.0,DefaultValue
2,Catherine,28.0,Female,,DefaultValue
3,David,45.0,,60000.0,DefaultValue
4,Eve,23.0,Female,35000.0,DefaultValue
5,Frank,32.0,,40000.0,DefaultValue
6,Grace,40.0,Male,42000.0,ExtraColumnValue


## Inconsistent Data Types

In [61]:
# Adding inconsistent data types to the DataFrame:
# Age contains the string 'male', Gender contains the float 1.1, and Income mixes
# numbers with numeric strings.
data = {'Name': ['Alice', 'Bob', 'Catherine', 'David', 'Eve', 'Frank', 'Grace'],
        'Age': [34, None, 'male', 4, 23, 32, 40],
        'Gender': ['Female', 1.1, 'Female', None, 'Female', None, 'Male'],
        'Income': [50000, 45000, None, 60000, '35000.0', 40000, "42000"]}
df = pd.DataFrame(data)

# Save to CSV
df.to_csv('inconsistent_datatypes.csv', index=False)

In [62]:
df = pd.read_csv('inconsistent_datatypes.csv')
df

Unnamed: 0,Name,Age,Gender,Income
0,Alice,34,Female,50000.0
1,Bob,,1.1,45000.0
2,Catherine,male,Female,
3,David,4,,60000.0
4,Eve,23,Female,35000.0
5,Frank,32,,40000.0
6,Grace,40,Male,42000.0


In [59]:
# Age comes back as object because the column contains the string 'male';
# Income parses cleanly as float64 once it has been through the CSV round-trip.
df.dtypes

Name       object
Age        object
Gender     object
Income    float64
dtype: object

In [60]:
# errors='coerce' converts entries that cannot be parsed as numbers (here 'male') to NaN.
df['Age'] = pd.to_numeric(df['Age'], errors='coerce')
df

Unnamed: 0,Name,Age,Gender,Income
0,Alice,34.0,Female,50000.0
1,Bob,,1.1,45000.0
2,Catherine,,Female,
3,David,4.0,,60000.0
4,Eve,23.0,Female,35000.0
5,Frank,32.0,,40000.0
6,Grace,40.0,Male,42000.0


In [63]:
# Without errors='coerce', to_numeric raises on the first value it cannot parse.
# Re-read the raw column so the demonstration does not depend on whether the
# coercion cell above has already replaced 'male' with NaN, and catch the error
# so the notebook still runs top to bottom.
raw_age = pd.read_csv('inconsistent_datatypes.csv')['Age']
try:
    df['Age'] = pd.to_numeric(raw_age)
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

ValueError: Unable to parse string "male" at position 2

# Generator

In [64]:
def count_up_to(max):
    """Yield 1, 2, 3, ... for as long as the value does not exceed ``max``.

    Values are produced one at a time, on demand; nothing is yielded when
    ``max`` is below 1.
    """
    current = 1
    while True:
        if not current <= max:
            return
        yield current
        current += 1

counter = count_up_to(5)

# Pull only the first two values; the generator stays paused after each one
print(next(counter), next(counter), sep="\n")  # Output: 1, then 2

1
2


In [65]:
# Create large_file.txt containing one million numbered lines
with open('large_file.txt', 'w') as f:
    f.writelines(f"This is line {i}\n" for i in range(1, 1000001))

In [66]:
def read_large_file(file_path):
    """Lazily yield each line of ``file_path`` with surrounding whitespace stripped.

    The file is opened in text mode when iteration starts and is consumed one
    line at a time rather than being read in full.
    """
    with open(file_path, 'r') as handle:
        yield from (raw_line.strip() for raw_line in handle)

PREVIEW_LINES = 12  # number of lines to show before stopping

large_file_reader = read_large_file('large_file.txt')

try:
    # enumerate replaces the hand-maintained counter and makes the stopping point
    # explicit; it also avoids rebinding the name `counter`, used earlier for a generator.
    for line_number, line in enumerate(large_file_reader, start=1):
        # Your processing code here
        print(line)
        if line_number >= PREVIEW_LINES:
            break
finally:
    # Breaking out early leaves the generator suspended inside its `with` block;
    # close() finishes it so the file handle is released right away.
    large_file_reader.close()

This is line 1
This is line 2
This is line 3
This is line 4
This is line 5
This is line 6
This is line 7
This is line 8
This is line 9
This is line 10
This is line 11
This is line 12
