In [1]:
# Exercise 1: Identifying and Handling Missing Data
import pandas as pd

# Sample dataset with missing values
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', None],
    'Age': [24, 30, None, 22, 35],
    'Salary': [48000, None, 57000, None, 60000]
}
df = pd.DataFrame(data)

# Filling missing values and dropping rows
df['Age'].fillna(df['Age'].mean(), inplace=True)
df['Salary'].fillna(df['Salary'].median(), inplace=True)
df.dropna(subset=['Name'], inplace=True)
print('After cleaning:\n', df)

After cleaning:
       Name    Age   Salary
0    Alice  24.00  48000.0
1      Bob  30.00  57000.0
2  Charlie  27.75  57000.0
3    David  22.00  57000.0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Salary'].fillna(df['Salary'].median(), inplace=True)


In [2]:
# Exercise 2: Standardizing Categorical Data
# Sample dataset with inconsistent categorical values
data = {
    'Product': ['Laptop', 'Laptop', 'Desktop', 'Tablet', 'Tablet'],
    'Category': ['Electronics', 'electronics', 'Electronics', 'Gadgets', 'gadgets']
}
df = pd.DataFrame(data)

# Standardize category values
df['Category'] = df['Category'].str.capitalize()
print('Standardized Data:\n', df)

Standardized Data:
    Product     Category
0   Laptop  Electronics
1   Laptop  Electronics
2  Desktop  Electronics
3   Tablet      Gadgets
4   Tablet      Gadgets


In [19]:
# Pratice Task 1: Identifying and Handling Missing Data
import pandas as pd

df = pd.read_csv('Titanic-Dataset.csv')

df.fillna({'Age': df['Age'].mean(), 'Fare': df['Fare'].median()}, inplace=True)
df.dropna(subset=['Name'], inplace=True)

print(df.head(10).to_string(index=False))

 PassengerId  Survived  Pclass                                                Name    Sex       Age  SibSp  Parch           Ticket    Fare Cabin Embarked
           1         0       3                             Braund, Mr. Owen Harris   male 22.000000      1      0        A/5 21171  7.2500   NaN        S
           2         1       1 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female 38.000000      1      0         PC 17599 71.2833   C85        C
           3         1       3                              Heikkinen, Miss. Laina female 26.000000      0      0 STON/O2. 3101282  7.9250   NaN        S
           4         1       1        Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.000000      1      0           113803 53.1000  C123        S
           5         0       3                            Allen, Mr. William Henry   male 35.000000      0      0           373450  8.0500   NaN        S
           6         0       3                                    Moran, Mr.

In [14]:
 # Pratice Task 2: Implement data transformations to normalize numerical columns
import pandas as pd

df = pd.read_csv('Titanic-Dataset.csv')

df['Sex'] = df['Sex'].map({'male': 1, 'female': 0})
df['Embarked'] = df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

print("After Normalization:")
print(df[['Name','Sex', 'Embarked']].head(10).to_string(index=False))

After Normalization:
                                               Name  Sex  Embarked
                            Braund, Mr. Owen Harris    1       0.0
Cumings, Mrs. John Bradley (Florence Briggs Thayer)    0       1.0
                             Heikkinen, Miss. Laina    0       0.0
       Futrelle, Mrs. Jacques Heath (Lily May Peel)    0       0.0
                           Allen, Mr. William Henry    1       0.0
                                   Moran, Mr. James    1       2.0
                            McCarthy, Mr. Timothy J    1       0.0
                     Palsson, Master. Gosta Leonard    1       0.0
  Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)    0       0.0
                Nasser, Mrs. Nicholas (Adele Achem)    0       1.0


In [None]:
# Pratice Task 3: Standardize Categorial Columns and Remove Duplicate
import pandas as pd
df = pd.read_csv('Titanic-Dataset.csv')

In [16]:
# Homework 1 Identifying and Handling Missing Data
import pandas as pd
df = pd.read_csv('amazon.csv')
pd.set_option('display.max_colwidth', 40)
pd.set_option('display.width', 100)

df['rating_count'] = pd.to_numeric(df['rating_count'], errors='coerce')
df['rating_count'].fillna(df['rating_count'].median(), inplace=True)

print(df[['product_name','rating','rating_count']].head(10).to_string(index=False))

                                                                                                                                                                                           product_name rating  rating_count
                                     Wayona Nylon Braided USB to Lightning Fast Charging and Data Sync Cable Compatible for iPhone 13, 12,11, X, 8, 7, 6, 5, iPad Air, Pro, Mini (3 FT Pack of 1, Grey)    4.2         293.0
      Ambrane Unbreakable 60W / 3A Fast Charging 1.5m Braided Type C Cable for Smartphones, Tablets, Laptops & other Type C devices, PD Technology, 480Mbps Data Sync, Quick Charge 3.0 (RCT15A, Black)    4.0         293.0
                                                               Sounce Fast Phone Charging Cable & Data Sync USB Cable Compatible for iPhone 13, 12,11, X, 8, 7, 6, 5, iPad Air, Pro, Mini & iOS Devices    3.9         293.0
boAt Deuce USB 300 2 in 1 Type-C & Micro USB Stress Resistant, Tangle-Free, Sturdy Cable with 3A Fast Charging & 480

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['rating_count'].fillna(df['rating_count'].median(), inplace=True)


In [14]:
# Homework 2 Implement data transformations to normalize numerical columns
import pandas as pd

df = pd.read_csv('amazon.csv')

df['discounted_price'] = pd.to_numeric(df['discounted_price'].str.replace('₹',''), errors='coerce')
df['actual_price'] = pd.to_numeric(df['actual_price'].str.replace('₹',''), errors='coerce')
df['rating'] = pd.to_numeric(df['rating'], errors='coerce')

df['discounted_price_norm'] = (df['discounted_price'] - df['discounted_price'].min()) / (df['discounted_price'].max() - df['discounted_price'].min())
df['actual_price_norm'] = (df['actual_price'] - df['actual_price'].min()) / (df['actual_price'].max() - df['actual_price'].min())
df['rating_norm'] = (df['rating'] - df['rating'].min()) / (df['rating'].max() - df['rating'].min())

print("After Normalization:")
print(df[['discounted_price_norm','actual_price_norm','rating_norm']].head(10).to_string(index=False))

After Normalization:
 discounted_price_norm  actual_price_norm  rating_norm
              0.375000                NaN     0.733333
              0.166667           0.322917     0.666667
              0.166667                NaN     0.633333
              0.302083           0.687500     0.733333
              0.119792           0.375000     0.733333
              0.114583                NaN     0.633333
              0.143365           0.479167     0.700000
              0.197917           0.270833     0.766667
              0.479167           1.000000     0.733333
              0.166667           0.270833     0.666667


In [17]:
# Homework 3 Standardize Categorial Columns and Remove Duplicate
df = pd.read_csv('amazon.csv')

df['product_name'] = df['product_name'].str.strip().str.lower()
df['category'] = df['category'].str.strip().str.lower()
df.drop_duplicates(inplace=True)

def shorten(text, limit=35):
    text = str(text)
    if len(text) > limit:
        return text[:limit] + '...'
    return text

df['product_name_short'] = df['product_name'].apply(shorten)
df['category_short'] = df['category'].apply(shorten)

cols = ['product_name_short', 'category_short', 'discounted_price', 'actual_price']

print(df[cols].head(10).to_string(index=False))

                    product_name_short                         category_short discounted_price actual_price
wayona nylon braided usb to lightni... computers&accessories|accessories&p...             ₹399       ₹1,099
ambrane unbreakable 60w / 3a fast c... computers&accessories|accessories&p...             ₹199         ₹349
sounce fast phone charging cable & ... computers&accessories|accessories&p...             ₹199       ₹1,899
boat deuce usb 300 2 in 1 type-c & ... computers&accessories|accessories&p...             ₹329         ₹699
portronics konnect l 1.2m fast char... computers&accessories|accessories&p...             ₹154         ₹399
ptron solero tb301 3a type-c data a... computers&accessories|accessories&p...             ₹149       ₹1,000
boat micro usb 55 tangle-free, stur... computers&accessories|accessories&p...          ₹176.63         ₹499
mi usb type-c cable smartphone (bla... computers&accessories|accessories&p...             ₹229         ₹299
tp-link usb wifi adapter for