In [20]:
import pandas as pd
import numpy as np

data = {'Name': ['Alice', 'Bob', 'bob', 'David'],
        'Age': [45, np.nan, 35,55],
        'City': ['NY', 'LA', np.nan, 'Chicago']}

df = pd.DataFrame(data)




# Filling missing values
df['Age'].fillna(df['Age'].mean(), inplace=True)  # Fill age with mean
df['City'].fillna('salme', inplace=True)  # Fill missing city with 'Unknown'

print(df)


    Name   Age     City
0  Alice  45.0       NY
1    Bob  45.0       LA
2    bob  35.0    salme
3  David  55.0  Chicago


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(), inplace=True)  # Fill age with mean
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['City'].fillna('salme', inplace=True)  # Fill missing city with 'Unknown'


In [22]:
import pandas as pd
import numpy as np
df = pd.DataFrame({ 'Name': ['Alice', 'Bob', 'Bob', 'Charlie']})
df.drop_duplicates(inplace=True)
print(df)

      Name
0    Alice
1      Bob
3  Charlie


In [29]:
from scipy import stats

df = pd.DataFrame({'Salary': [3000, 3200, 3100, 50000,9500,99999999999]})
df = df[(np.abs(stats.zscore(df['Salary'])) < 3)]  # Remove extreme outliers
print(df)


        Salary
0         3000
1         3200
2         3100
3        50000
4         9500
5  99999999999


In [30]:
from sklearn.preprocessing import LabelEncoder

df = pd.DataFrame({'Category': ['Red', 'Blue', 'Green']})
encoder = LabelEncoder()
df['Category'] = encoder.fit_transform(df['Category'])
print(df)


   Category
0         2
1         0
2         1


In [32]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Sample DataFrame
df = pd.DataFrame({'Category': ['Red', 'Blue', 'Green', 'Red', 'Green', 'Blue']})

# 1️⃣ Label Encoding
label_encoder = LabelEncoder()
df['Label_Encoded'] = label_encoder.fit_transform(df['Category'])

# 2️⃣ One-Hot Encoding using sklearn's OneHotEncoder
one_hot_encoder = OneHotEncoder(sparse_output=False)  # sparse=False returns a NumPy array
one_hot_encoded = one_hot_encoder.fit_transform(df[['Category']])

# Convert One-Hot Encoding result to a DataFrame
df_one_hot = pd.DataFrame(one_hot_encoded, columns=one_hot_encoder.get_feature_names_out(['Category']))

# 3️⃣ Merging One-Hot Encoded Data with Original DataFrame
df_final = pd.concat([df, df_one_hot], axis=1)

# Display the final DataFrame
print(df_final)


  Category  Label_Encoded  Category_Blue  Category_Green  Category_Red
0      Red              2            0.0             0.0           1.0
1     Blue              0            1.0             0.0           0.0
2    Green              1            0.0             1.0           0.0
3      Red              2            0.0             0.0           1.0
4    Green              1            0.0             1.0           0.0
5     Blue              0            1.0             0.0           0.0


In [35]:
import pandas as pd

# Customer details
df1 = pd.DataFrame({'ID': [1, 2, 3], 'Name': ['Alice', 'Bob', 'Charlie']})

# Transaction details
df2 = pd.DataFrame({'ID': [1, 2, 3], 'Purchase': [500, 700, 1200]})

# Merge datasets on 'ID'
df_merged1 = pd.merge(df1, df2, on='ID')
# Feature Engineering: Creating 'Loyalty_Score'
df_merged['Loyalty_Score'] = df_merged['Purchase'] / df_merged['Purchase'].max()

print(df_merged)





   ID     Name  Purchase  Loyalty_Score
0   1    Alice       500       0.416667
1   2      Bob       700       0.583333
2   3  Charlie      1200       1.000000
