In [383]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import re
import warnings
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import string
from nltk.corpus import stopwords
warnings.filterwarnings('ignore')


In [384]:
# Handling Missing Values

import pandas as pd
import numpy as np

# dataset with missing values
data = {
    "Name": ["Wajid", "Ali", "Mansoor", "Zahid", "Wali"],
    "Age": [25, np.nan, 30, 22, np.nan],
    "City": ["Lahore", "Karachi", np.nan, "Islamabad", "Quetta"],
    "Salary": [50000, 60000, np.nan, 55000, 58000]
}

df = pd.DataFrame(data)
print("Original Dataset:")
df


Original Dataset:


Unnamed: 0,Name,Age,City,Salary
0,Wajid,25.0,Lahore,50000.0
1,Ali,,Karachi,60000.0
2,Mansoor,30.0,,
3,Zahid,22.0,Islamabad,55000.0
4,Wali,,Quetta,58000.0


In [385]:
# for single value to be filled
df['City']=df['City'].fillna('Gilgit')
df.head()

Unnamed: 0,Name,Age,City,Salary
0,Wajid,25.0,Lahore,50000.0
1,Ali,,Karachi,60000.0
2,Mansoor,30.0,Gilgit,
3,Zahid,22.0,Islamabad,55000.0
4,Wali,,Quetta,58000.0


In [386]:
# compute median-based values first 
age_median = df['Age'].median()
Salary_median = df['Salary'].median()

# for multple values
df = df.fillna({
    'City': 'Gilgit',
    'Salary': Salary_median,
    'Age': age_median,   
       
})
df.head()


Unnamed: 0,Name,Age,City,Salary
0,Wajid,25.0,Lahore,50000.0
1,Ali,25.0,Karachi,60000.0
2,Mansoor,30.0,Gilgit,56500.0
3,Zahid,22.0,Islamabad,55000.0
4,Wali,25.0,Quetta,58000.0


In [387]:
# Replacing Values

df['City'] = df['City'].replace('Lahore', 'Gilgit')
df['Salary'] = df['Salary'].replace(0, df['Salary'].mean())

In [388]:
# Detecting/Filtering Missing Data

df[df['City'].isna()]    
df[df['Age'].notna()] 

Unnamed: 0,Name,Age,City,Salary
0,Wajid,25.0,Gilgit,50000.0
1,Ali,25.0,Karachi,60000.0
2,Mansoor,30.0,Gilgit,56500.0
3,Zahid,22.0,Islamabad,55000.0
4,Wali,25.0,Quetta,58000.0


In [389]:
# For rows

df = df[df['Name'] != 'Wajid']

In [390]:
df.head()

Unnamed: 0,Name,Age,City,Salary
1,Ali,25.0,Karachi,60000.0
2,Mansoor,30.0,Gilgit,56500.0
3,Zahid,22.0,Islamabad,55000.0
4,Wali,25.0,Quetta,58000.0


In [391]:
# for multple to drop

df = df.drop(df[(df['Name'] == 'Wajid') &
                (df['Age'] == 25.0) &
                (df['City'] == 'Lahore') &
                (df['Salary'] == 50000.0)].index)


In [392]:
df.columns

Index(['Name', 'Age', 'City', 'Salary'], dtype='object')

In [393]:
# drop columns for single columns 
df=df.drop(columns=['City'])


In [394]:
df.head()

Unnamed: 0,Name,Age,Salary
1,Ali,25.0,60000.0
2,Mansoor,30.0,56500.0
3,Zahid,22.0,55000.0
4,Wali,25.0,58000.0


In [395]:
# for Multple Coloumns

df = df.drop(['Age', 'Name'], axis=1)

In [396]:
df.head()

Unnamed: 0,Salary
1,60000.0
2,56500.0
3,55000.0
4,58000.0


In [397]:
# Interpolating Values

df = pd.DataFrame({
    'Day': [1, 2, 3, 4, 5, 6],
    'Temperature': [30, None, 32, None, 35, 36]
})

In [398]:
print("Before Interpolation:\n", df)

df['Temperature'] = df['Temperature'].interpolate()

print("\nAfter Interpolation:\n", df)

Before Interpolation:
    Day  Temperature
0    1         30.0
1    2          NaN
2    3         32.0
3    4          NaN
4    5         35.0
5    6         36.0

After Interpolation:
    Day  Temperature
0    1         30.0
1    2         31.0
2    3         32.0
3    4         33.5
4    5         35.0
5    6         36.0


In [399]:
# Forward/Backward Limit

df['Temperature'].interpolate(limit=1)   # fill only 1 NaN at a time


0    30.0
1    31.0
2    32.0
3    33.5
4    35.0
5    36.0
Name: Temperature, dtype: float64

In [400]:
# Interpolation on Time Series

date_rng = pd.date_range(start='2025-01-01', periods=6, freq='D')
df = pd.DataFrame({'Date': date_rng, 'Value': [10, None, 30, None, None, 60]})
df.set_index('Date', inplace=True)

df['Value'] = df['Value'].interpolate(method='time')


In [401]:
df.head()

Unnamed: 0_level_0,Value
Date,Unnamed: 1_level_1
2025-01-01,10.0
2025-01-02,20.0
2025-01-03,30.0
2025-01-04,40.0
2025-01-05,50.0


From above observaation:

interpolation estimates missing values using surrounding data points in a time-aware linear fashion.

In [402]:
# Convert to DateTime

import pandas as pd  

df = pd.DataFrame({
    'signup_date': ['2025-01-01', '2025-01-02', '2025-01-03'],
    'last_login': ['01-02-2025', '02-02-2025', '03-02-2025']  # Different format
})

# Convert both to datetime
df['signup_date'] = pd.to_datetime(df['signup_date'])
df['last_login'] = pd.to_datetime(df['last_login'], format='%d-%m-%Y')

print(df.dtypes)
df

# Point to note here: Converts string dates into real datetime objects
# now you can calculate differences, sorting, filtering by time.


signup_date    datetime64[ns]
last_login     datetime64[ns]
dtype: object


Unnamed: 0,signup_date,last_login
0,2025-01-01,2025-02-01
1,2025-01-02,2025-02-02
2,2025-01-03,2025-02-03


In [403]:
# Convert to Numeric Sometimes numbers come as strings ( with currency symbols, commas).

df = pd.DataFrame(
    {
        'Price': ['1000', '2500', '3000']
    }
)

# Remove commas and convert 
df['Price']=df['Price'].str.replace(',', '').astype(float)

print(df.dtypes)
df                                                

Price    float64
dtype: object


Unnamed: 0,Price
0,1000.0
1,2500.0
2,3000.0


In [404]:
print(type(df['Price'][0]))


<class 'numpy.float64'>


In [424]:
df = pd.DataFrame({
    "Age": [25, 30, 22, 28],
    "Salary": [50000, 60000, 55000, 58000],
    "City": ["Lahore", "Karachi", "Gilgit", "Islamabad"],
    "Gender": ["M", "F", "M", "F"]
})

df.dtypes

Age        int64
Salary     int64
City      object
Gender    object
dtype: object

In [425]:
# for categorical features

categorical_cols = df.select_dtypes(include=['object','category']).columns.tolist()
categorical_df = df[categorical_cols]
categorical_df


Unnamed: 0,City,Gender
0,Lahore,M
1,Karachi,F
2,Gilgit,M
3,Islamabad,F


In [430]:
# for Numerical features
numeric_cols = df.select_dtypes(include='number').columns.tolist()
print("Numeric Columns:", numeric_cols)

numeric_df = df[numeric_cols]
numeric_df


Numeric Columns: ['Age', 'Salary']


Unnamed: 0,Age,Salary
0,25,50000
1,30,60000
2,22,55000
3,28,58000


In [433]:
# for multiple columns ]

df = pd.DataFrame({
    'Age': ['23','34','22'],
    'Salary':['2300','34000','4300']
})

df=df.astype({'Age':'int', 'Salary':'float'})
df.dtypes

Age         int64
Salary    float64
dtype: object

In [434]:
print(type(df['Salary'][0]))

<class 'numpy.float64'>


In [None]:
from bs4 import BeautifulSoup

df = pd.DataFrame({
    'Review': [
        "<p>This is <b>awesome</b> product!</p>",
        "<div>Very <i>bad</i> experience</div>"
    ]
})


# Remove HTMl Tags

df['cleaned_Review`']=df['Review'].apply(lambda x: BeautifulSoup(x, "html.parser").get_text())

df 

Unnamed: 0,Review,cleaned_Review`
0,<p>This is <b>awesome</b> product!</p>,This is awesome product!
1,<div>Very <i>bad</i> experience</div>,Very bad experience


In [442]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('stopwords')

text = "This is an example of text cleaning with stopwords, This is me Wajid and i am from gilgit and this for removing stopword"

# Tokenize the text
words = word_tokenize(text)

# Remove stopwords
filtered = [w for w in words if w.lower() not in stopwords.words('english')]

("Before:", words)
("After:", filtered)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Wajid\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Wajid\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


('After:',
 ['example',
  'text',
  'cleaning',
  'stopwords',
  ',',
  'Wajid',
  'gilgit',
  'removing',
  'stopword'])

In [446]:
# Punctuation Removal

import string

text = "Hello!! How are you?? i hope, you doing good@.."

# Remove all punctuation
clean_text = text.translate(str.maketrans('', '', string.punctuation))

print(clean_text)


Hello How are you i hope you doing good


In [448]:
# Stemming 

from nltk.stem import PorterStemmer

Stemmer=PorterStemmer()
words=['running','ending','playing','fairness']
stem=[Stemmer.stem(w) for w in words]
print(stem)

['run', 'end', 'play', 'fair']


In [None]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
words = ["running", "runner", "easily", "fairness"]
lemmas = [lemmatizer.lemmatize(w, pos='v') for w in words]  # 'v' for verb
(lemmas)

# Typically, the pipeline is:
# Text → Remove punctuation → Lowercase → Stopword removal → Stemming/Lemmatization → Vectorization

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Wajid\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


['run', 'runner', 'easily', 'fairness']

In [455]:
# Since machine can not directly understand so we convert them to numbers

from sklearn.feature_extraction.text import CountVectorizer

corpus = [
    "I love machine learning",
    "Machine learning is amazing",
]
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

print(vectorizer.get_feature_names_out())  # Vocabulary
print(X.toarray())  # Word count matrix


['amazing' 'is' 'learning' 'love' 'machine']
[[0 0 1 1 1]
 [1 1 1 0 1]]


In [457]:
# TF–IDF Term Frequency – Inverse Document Frequency

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

print(vectorizer.get_feature_names_out())  # Vocabulary
(X.toarray())  # TF-IDF matrix


['amazing' 'is' 'learning' 'love' 'machine']


array([[0.        , 0.        , 0.50154891, 0.70490949, 0.50154891],
       [0.57615236, 0.57615236, 0.40993715, 0.        , 0.40993715]])

In [None]:
# Normalization & Standardization
import pandas as pd

df = pd.DataFrame({
    'age':[14,15,17,19],
    'Salary':[2000,300,400,500]
})

df

Unnamed: 0,age,Salary
0,14,2000
1,15,300
2,17,400
3,19,500


In [464]:
# Apply Normalization Min-Max Scaling:

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df_normalized = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

print('df_normalized')
df_normalized


df_normalized


Unnamed: 0,age,Salary
0,0.0,1.0
1,0.2,0.0
2,0.6,0.058824
3,1.0,0.117647


In [465]:
# Standardization (Z-score Scaling)

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df_standardized = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

df_standardized

Unnamed: 0,age,Salary
0,-1.1717,1.7231
1,-0.650945,-0.717958
2,0.390567,-0.574367
3,1.432078,-0.430775


Key Notes

Normalization → useful when you need values between [0,1] (like images, distances).

Standardization → useful when algorithm assumes Gaussian distribution (SVM, Logistic Regression, PCA).

In [482]:
import pandas as pd

# Sample dataset
data = {
    'Name': ['Ali', 'Ali', 'Aliyan','Aliyan','Sumera'],
    'Age': [23, 34, 23, 120, 222],
    'Salary':[50000, 20000, 40000, 70000, 60000]
}

df = pd.DataFrame(data)
print("Original Data:\n", df, "\n")

# Remove duplicates
df = df.drop_duplicates()
print("After Removing Duplicates:\n", df, "\n")


Original Data:
      Name  Age  Salary
0     Ali   23   50000
1     Ali   34   20000
2  Aliyan   23   40000
3  Aliyan  120   70000
4  Sumera  222   60000 

After Removing Duplicates:
      Name  Age  Salary
0     Ali   23   50000
1     Ali   34   20000
2  Aliyan   23   40000
3  Aliyan  120   70000
4  Sumera  222   60000 



In [None]:
# Using Z-Score for outliers
from scipy import stats

df_zscore = df[(stats.zscore(df['Age']) < 3)]  # remove values 3 std dev away
print(df_zscore)

     Name  Age  Salary
0     Ali   23   50000
1     Ali   34   20000
2  Aliyan   23   40000
3  Aliyan  120   70000
4  Sumera  222   60000


Key Notes

Duplicates can bias your dataset → always remove them.

Outliers:

Use IQR for small datasets.

Use Z-score for normally distributed data.

In some cases (fraud detection, anomaly detection) → keep outliers.

In [487]:
## Encoding categorical data

from sklearn.preprocessing import LabelEncoder

df = pd.DataFrame({
    'Name':['Ali','Wajid','sara','zara'],
    'City':['Lahore','Karachi','Gilgit','Kashmir']
})

# Apply label encoding

le = LabelEncoder()
df['City_encoded'] = le.fit_transform(df['City'])
df

Unnamed: 0,Name,City,City_encoded
0,Ali,Lahore,3
1,Wajid,Karachi,1
2,sara,Gilgit,0
3,zara,Kashmir,2


In [493]:
# One Hot Encoding


df = pd.DataFrame({
    'CITY':['Multan','Hyderabad','Pindi']
})

#apply one hot encoding
df_encoded=pd.get_dummies(df, columns=['CITY'], drop_first=True)
df_encoded = df_encoded.astype(int)
df_encoded

Unnamed: 0,CITY_Multan,CITY_Pindi
0,1,0
1,0,0
2,0,1


In [494]:
# Ordinal Encoding (when categories have order)

from sklearn.preprocessing import OrdinalEncoder

df = pd.DataFrame({
    'object':['small','medium','large','small']
})

# define order
encoder = OrdinalEncoder(categories=[['small', 'medium','large']])
df['object_encoded']=encoder.fit_transform(df[['object']])
df

Unnamed: 0,object,object_encoded
0,small,0.0
1,medium,1.0
2,large,2.0
3,small,0.0


Summary

Label Encoding → For ML models like Decision Trees.

One-Hot Encoding → For regression/linear models.

Ordinal Encoding → When order matters (e.g., Low < Medium < High).

In [None]:
import pandas as pd

# Example dataset
df = pd.DataFrame({
    'Text': [
        "I love AI and Machine Learning",
        "Data Science is powerful",
        "Python is great for coding"
    ]
})

# Add new feature: word count
df['Word_Count'] = df['Text'].apply(lambda x: len(x.split()))

df

# Key note: New feature (Word_Count) is added based on text length.

Unnamed: 0,Text,Word_Count
0,I love AI and Machine Learning,6
1,Data Science is powerful,4
2,Python is great for coding,5


In [498]:
## Date Parts (Datetime Example)

# Example dataset with dates
date_rng = pd.date_range(start='2025-01-01', periods=4, freq='D')
df = pd.DataFrame({'Date': date_rng})

# Extract useful features from date
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day
df['Weekday'] = df['Date'].dt.day_name()

df

Unnamed: 0,Date,Year,Month,Day,Weekday
0,2025-01-01,2025,1,1,Wednesday
1,2025-01-02,2025,1,2,Thursday
2,2025-01-03,2025,1,3,Friday
3,2025-01-04,2025,1,4,Saturday


In [None]:
# Combine Features

# Salary per year of age (synthetic feature)
df2 = pd.DataFrame({
    'Age': [25, 30, 35],
    'Salary': [50000, 60000, 75000]
})

df2['Salary_per_Age'] = df2['Salary'] / df2['Age']
df2

# Created a new feature (Salary_per_Age) by combining columns.

Unnamed: 0,Age,Salary,Salary_per_Age
0,25,50000,2000.0
1,30,60000,2000.0
2,35,75000,2142.857143


Summary of Feature Engineering Examples:

Text Features → Word count, character count, keyword presence.

Date Features → Year, month, day, weekday, quarter.

Numeric Features → Ratios, differences, combinations (e.g., salary per age)

In [503]:
# Merging Datasets
# Note...
"""When working with real-world data,
we often have multiple datasets that need to be combined into one for analysis or modeling. 
Pandas provides powerful functions for this: concat, merge, and join"""

import pandas as pd

df1 = pd.DataFrame({
    'ID': [1,2,3],
    'Name':['Ali','Wajid','Karim']
})

df2 = pd.DataFrame({
    'ID':[3,4],
    'Name':['Wali','Atish']
})

# Combine 
df_concat = pd.concat([df1,df2], axis=0)
print(df_concat)

   ID   Name
0   1    Ali
1   2  Wajid
2   3  Karim
0   3   Wali
1   4  Atish


In [504]:
# To save after merging 
df_concat.to_csv('Merge_dataset.csv', index=False)
print('Saved Merge dataset to Merge_data.csv')

Saved Merge dataset to Merge_data.csv
