# Pandas

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Small example table: one row per person with their age and pet.
df = pd.DataFrame(
    data={
        "Name": ["Alice", "Bob", "Charlie", "David"],
        "Age": [24, 27, 22, 32],
        "Pet": ["Cat", "Dog", "Fish", "Bird"],
    },
)

In [None]:
# A bare expression on the last line of a notebook cell is rendered as the cell output.
df

# Inplace vs Copy

In [None]:
# Without inplace=True, set_index returns a new DataFrame indexed by 'Name';
# df itself is not modified (the result here is only displayed, not stored).
df.set_index('Name')

In [None]:
# df still has 'Name' as a regular column: the set_index call without
# inplace=True did not change it.
df

In [None]:
# inplace=True modifies df itself (and returns None) instead of returning a new DataFrame.
df.set_index('Name', inplace=True)

In [None]:
# 'Name' is now the index of df rather than a regular column.
df

## iloc

In [None]:
# Show the current state of df before indexing into it by position.
df

In [None]:
# iloc selects by integer position: this is the first row, returned as a Series.
df.iloc[0]

In [None]:
# Select a single cell by (row position, column position).
# 'Name' was moved into the index by set_index(..., inplace=True), so only the
# 'Age' and 'Pet' columns remain and the valid column positions are 0 and 1.
# Position 1 is the first row's 'Pet'; position 2 would raise IndexError.
df.iloc[0, 1]

In [None]:
# Positional slice: rows at positions 0 and 1 (the end position is excluded).
df.iloc[0:2]

## loc

In [None]:
# Make 'Name' the index so rows can be looked up by name with loc.
# 'Name' may already be the index from an earlier cell; calling set_index again
# would then raise KeyError, so only do it while 'Name' is still a column.
if 'Name' in df.columns:
    df.set_index('Name', inplace=True)

In [None]:
# Show df with 'Name' as its index before looking rows up by label.
df

In [None]:
# loc selects by label: this is the row whose index label is 'Alice'.
df.loc['Alice']

## Advanced Indexing

In [None]:
# Rebuild the example table from scratch so it has the default integer index again.
df = pd.DataFrame(
    data={
        "Name": ["Alice", "Bob", "Charlie", "David"],
        "Age": [24, 27, 22, 32],
        "Pet": ["Cat", "Dog", "Fish", "Bird"],
    },
)

In [None]:
# Show the freshly rebuilt table.
df

In [None]:
# Boolean mask as a NumPy array: one flag per row, True for the rows to keep
boolean_array = np.array([True, False, True, False], dtype=bool)

In [None]:
# iloc accepts a NumPy boolean array as a mask and keeps the rows where it is
# True (here the rows at positions 0 and 2).
df.iloc[boolean_array]

In [None]:
# NOTE(review): df['Age'] > 25 is a boolean Series, not a NumPy array. iloc does
# not accept a boolean Series as a mask, so pandas raises on this call. This
# looks like a deliberate contrast with loc; confirm. To make it work, pass
# (df['Age'] > 25).to_numpy() or use df.loc instead.
df.iloc[df['Age'] > 25]

In [None]:
# loc accepts a boolean Series as a mask: keeps the rows where Age is greater than 25.
df.loc[df['Age'] > 25]

In [None]:
# isin builds a boolean mask that is True where 'Pet' is one of the listed
# values; loc keeps those rows.
df.loc[df['Pet'].isin(['Cat', 'Dog'])]

## apply

In [None]:
# Fresh copy of the example table for the apply examples.
df = pd.DataFrame(
    data={
        "Name": ["Alice", "Bob", "Charlie", "David"],
        "Age": [24, 27, 22, 32],
        "Pet": ["Cat", "Dog", "Fish", "Bird"],
    },
)
df

In [None]:
# Series.apply calls the given function once per element; here each age is
# squared and the results are stored in a new 'Age_squared' column.
df['Age_squared'] = df['Age'].apply(lambda x: x ** 2)
df

In [None]:
def square_age(age):
    """Return ``age`` raised to the power of two."""
    return pow(age, 2)

In [None]:
# apply also accepts a named function; square_age is called once per age and
# the results are stored in a new 'Age_squared_2' column.
df['Age_squared_2'] = df['Age'].apply(square_age)
df

## a tricky question

In [None]:
# Two frames with the same row labels but different column sets:
# df1 holds 0..8 in columns a-c, df2 holds 0..11 in columns a-d.
df1 = pd.DataFrame(np.arange(9).reshape(3, 3), columns=list("abc"))
df2 = pd.DataFrame(np.arange(12).reshape(3, 4), columns=list("abcd"))

In [None]:
# 3x3 frame with columns a, b, c holding the values 0..8.
df1

In [None]:
# 3x4 frame with columns a, b, c, d holding the values 0..11.
df2

In [None]:
# Arithmetic between DataFrames aligns on row and column labels first.
# Column 'd' exists only in df2, so the result has NaN in that column.
df1 + df2

# Data Cleaning

## Missing Data

In [None]:
# Sample table in which some cells are missing (given as None)
data = dict(
    Name=['Alice', 'Bob', 'Catherine', 'David', 'Eve', 'Frank'],
    Age=[34, None, 28, 45, 23, 32],
    Gender=['Female', 'Male', 'Female', None, 'Female', None],
    Income=[50000, 45000, None, 60000, 35000, 40000],
)
df = pd.DataFrame(data)

# Write it out without the index column so it can be read back as a plain CSV
df.to_csv('missing_data.csv', index=False)

In [None]:
# Load the CSV back into a DataFrame; empty fields are read as NaN.
df = pd.read_csv('missing_data.csv')
df

In [None]:
# Drop every row that contains at least one missing value.
# dropna returns a new DataFrame; df is left unchanged.
df_dropna = df.dropna()
df_dropna

In [None]:
# Replace every missing value, in every column, with 0.
# fillna returns a new DataFrame; df is left unchanged.
df_fill_zero = df.fillna(0)
df_fill_zero

In [None]:
# Forward fill: each missing value takes the last valid value above it in the
# same column (a missing value in the first row stays missing).
# DataFrame.ffill() is used because the `method` keyword of fillna is deprecated.
df_forward_fill = df.ffill()
df_forward_fill

In [None]:
# df.mean(numeric_only=True) gives one mean per numeric column. Passing that
# Series to fillna fills each of those columns with its own mean; columns
# without a computed mean keep their missing values.
df_fill_mean = df.fillna(df.mean(numeric_only=True))
df_fill_mean

## Inconsistent Data (extra feature)

In [None]:
# Adding inconsistent data to the DataFrame
data = {'Name': ['Alice', 'Bob', 'Catherine', 'David', 'Eve', 'Frank', 'Grace'],
        'Age': [34, None, 28, 45, 23, 32, 40],
        'Gender': ['Female', 'Male', 'Female', None, 'Female', None, 'Male'],
        'Income': [50000, 45000, None, 60000, 35000, 40000, 42000]}
df = pd.DataFrame(data)
df.loc[df['Name'] == 'Grace', 'ExtraColumn'] = 'ExtraColumnValue'

# Save to CSV
df.to_csv('inconsistent_data.csv', index=False)

In [None]:
# Load the CSV back into a DataFrame; empty fields are read as NaN.
df = pd.read_csv('inconsistent_data.csv')
df

In [None]:
# Only 'ExtraColumn' is inspected (subset). With a single column in the subset,
# how='all' drops exactly the rows where 'ExtraColumn' is missing, so only rows
# that have an 'ExtraColumn' value are kept.
df_drop = df.dropna(axis=0, how='all', subset=['ExtraColumn'])
df_drop

In [None]:
# Keep only the four expected columns, leaving 'ExtraColumn' out.
df_standardize = df[['Name', 'Age', 'Gender', 'Income']]
df_standardize

In [None]:
# Replace missing 'ExtraColumn' values with a default and store the result back.
# The filled column is assigned explicitly instead of calling
# fillna(inplace=True) on df['ExtraColumn']: that form operates on a selected
# Series, which pandas warns about and which does not update df under
# copy-on-write.
df['ExtraColumn'] = df['ExtraColumn'].fillna('DefaultValue')
df

## Inconsistent Data Type

In [None]:
# Adding inconsistent data types to the DataFrame
data = {'Name': ['Alice', 'Bob', 'Catherine', 'David', 'Eve', 'Frank', 'Grace'],
        'Age': [34, None, 'male', 4, 23, 32, 40],
        'Gender': ['Female', 1.1, 'Female', None, 'Female', None, 'Male'],
        'Income': [50000, 45000, None, 60000, '35000.0', 40000, "42000"]}
df = pd.DataFrame(data)

# Save to CSV
df.to_csv('inconsistent_datatypes.csv', index=False)

In [None]:
# Load the CSV back into a DataFrame; pandas infers each column's dtype from the text.
df = pd.read_csv('inconsistent_datatypes.csv')
df

In [None]:
# Inspect the dtype pandas inferred for each column.
df.dtypes

In [None]:
# Convert 'Age' to numbers. With errors='coerce', values that cannot be parsed
# as a number become NaN instead of raising an error.
df['Age'] = pd.to_numeric(df['Age'], errors='coerce')
df

In [None]:
# Without errors='coerce', to_numeric raises on a value it cannot parse.
# NOTE(review): if 'Age' was already coerced to numeric by the previous cell,
# there is nothing left for this call to reject. Confirm whether it was meant
# to run on the raw column to demonstrate the error.
df['Age'] = pd.to_numeric(df['Age'])

# Generator

In [None]:
def count_up_to(max):
    """Lazily yield the integers 1, 2, ... up to and including ``max``.

    Nothing is yielded when ``max`` is less than 1.
    """
    current = 1
    while True:
        # Stop as soon as the next value would exceed the limit.
        if not current <= max:
            return
        yield current
        current += 1

# Calling a generator function does not run its body; it returns a generator
# object that produces values on demand.
counter = count_up_to(5)

# Using next to get values lazily
print(next(counter))  # Output: 1
print(next(counter))  # Output: 2

In [None]:
# Create large_file.txt containing one million numbered lines.
# The generator expression hands the lines to the file one at a time, so the
# full text is never built in memory.
with open('large_file.txt', 'w') as f:
    f.writelines(f"This is line {i}\n" for i in range(1, 1000001))

In [None]:
def read_large_file(file_path):
    """Yield the lines of ``file_path`` one by one, stripped of surrounding whitespace.

    The file is read lazily, so only the current line is held in memory.
    """
    with open(file_path, 'r') as handle:
        yield from map(str.strip, handle)

# Creating the generator does not open the file yet; its body only starts
# running when the first line is requested.
large_file_reader = read_large_file('large_file.txt')

# Print only the first few lines, then stop pulling from the generator.
counter = 0
for line in large_file_reader:
    # Your processing code here
    print(line)
    # The check runs after the print and before the increment, so lines are
    # printed for counter = 0..11 (12 lines in total) before the loop breaks.
    if counter > 10:
        break
    counter +=1