In [1]:
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings("ignore")

np.random.seed(42)

def createdata():
    data = {
        'Age': np.random.randint(18, 70, size=20),
        'Salary': np.random.randint(12000, 30000, size=20),  # Corrected salary range
        'purchased': np.random.choice([0, 1], size=20),
        'Gender': np.random.choice(['Male', 'Female'], size=20),
        'city': np.random.choice(['New York', 'San Francisco', 'Los Angeles'], size=20)  # Corrected capitalization for "Los Angeles"
    }
    df = pd.DataFrame(data)
    return df


df = createdata()
print(df)


    Age  Salary  purchased  Gender           city
0    56   14433          1  Female  San Francisco
1    69   17311          1  Female    Los Angeles
2    46   17051          1  Female    Los Angeles
3    32   18420          1  Female  San Francisco
4    60   29568          0  Female    Los Angeles
5    25   18396          1  Female       New York
6    38   20666          0  Female  San Francisco
7    56   14747          1  Female       New York
8    36   12189          1  Female       New York
9    40   15005          1  Female  San Francisco
10   28   13899          0  Female    Los Angeles
11   28   13267          1    Male       New York
12   41   29912          0    Male  San Francisco
13   53   23394          1  Female       New York
14   57   15556          0  Female       New York
15   41   15890          1  Female       New York
16   20   20838          0  Female       New York
17   39   26502          0  Female    Los Angeles
18   19   22627          1  Female       New York


In [2]:
df.loc[5,'Age'] = np.nan
df.loc[9,'Salary'] = np.nan
df.head(10)

Unnamed: 0,Age,Salary,purchased,Gender,city
0,56.0,14433.0,1,Female,San Francisco
1,69.0,17311.0,1,Female,Los Angeles
2,46.0,17051.0,1,Female,Los Angeles
3,32.0,18420.0,1,Female,San Francisco
4,60.0,29568.0,0,Female,Los Angeles
5,,18396.0,1,Female,New York
6,38.0,20666.0,0,Female,San Francisco
7,56.0,14747.0,1,Female,New York
8,36.0,12189.0,1,Female,New York
9,40.0,,1,Female,San Francisco


In [2]:
#handling missing values 
df['Age'].fillna(df['Age'].mean(), inplace=True) 
df['Salary'].fillna(df['Salary'].median(), inplace=True)  

print(df)

    Age  Salary  purchased  Gender           city
0    56   14433          1  Female  San Francisco
1    69   17311          1  Female    Los Angeles
2    46   17051          1  Female    Los Angeles
3    32   18420          1  Female  San Francisco
4    60   29568          0  Female    Los Angeles
5    25   18396          1  Female       New York
6    38   20666          0  Female  San Francisco
7    56   14747          1  Female       New York
8    36   12189          1  Female       New York
9    40   15005          1  Female  San Francisco
10   28   13899          0  Female    Los Angeles
11   28   13267          1    Male       New York
12   41   29912          0    Male  San Francisco
13   53   23394          1  Female       New York
14   57   15556          0  Female       New York
15   41   15890          1  Female       New York
16   20   20838          0  Female       New York
17   39   26502          0  Female    Los Angeles
18   19   22627          1  Female       New York


In [3]:
df_dropped = df.dropna()
df_dropped.head(10)

Unnamed: 0,Age,Salary,purchased,Gender,city
0,56,14433,1,Female,San Francisco
1,69,17311,1,Female,Los Angeles
2,46,17051,1,Female,Los Angeles
3,32,18420,1,Female,San Francisco
4,60,29568,0,Female,Los Angeles
5,25,18396,1,Female,New York
6,38,20666,0,Female,San Francisco
7,56,14747,1,Female,New York
8,36,12189,1,Female,New York
9,40,15005,1,Female,San Francisco


In [4]:
df_dropped.shape

(20, 5)

In [5]:
from sklearn.impute import KNNImputer
knn_imputer = KNNImputer(n_neighbors=3) 
df[['Age', 'Salary']] = knn_imputer.fit_transform(df[['Age', 'Salary']])

In [6]:
df['Age_missing'] =df['Age'].isnull().astype(int)
df.head(10)

Unnamed: 0,Age,Salary,purchased,Gender,city,Age_missing
0,56.0,14433.0,1,Female,San Francisco,0
1,69.0,17311.0,1,Female,Los Angeles,0
2,46.0,17051.0,1,Female,Los Angeles,0
3,32.0,18420.0,1,Female,San Francisco,0
4,60.0,29568.0,0,Female,Los Angeles,0
5,25.0,18396.0,1,Female,New York,0
6,38.0,20666.0,0,Female,San Francisco,0
7,56.0,14747.0,1,Female,New York,0
8,36.0,12189.0,1,Female,New York,0
9,40.0,15005.0,1,Female,San Francisco,0


In [7]:
#label encoding

from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
df['Gender'] = label_encoder.fit_transform(df['Gender'])
df.head()


Unnamed: 0,Age,Salary,purchased,Gender,city,Age_missing
0,56.0,14433.0,1,0,San Francisco,0
1,69.0,17311.0,1,0,Los Angeles,0
2,46.0,17051.0,1,0,Los Angeles,0
3,32.0,18420.0,1,0,San Francisco,0
4,60.0,29568.0,0,0,Los Angeles,0


In [8]:
df = pd.get_dummies(df,columns=['city'],drop_first=True)
df.head()

Unnamed: 0,Age,Salary,purchased,Gender,Age_missing,city_New York,city_San Francisco
0,56.0,14433.0,1,0,0,False,True
1,69.0,17311.0,1,0,0,False,False
2,46.0,17051.0,1,0,0,False,False
3,32.0,18420.0,1,0,0,False,True
4,60.0,29568.0,0,0,0,False,False


In [18]:
print(df.columns)
df['city'] = df[['city_New York', 'city_San Francisco']].idxmax(axis=1).map(
    {'city_New York': 'New York', 'city_San Francisco': 'San Francisco'}
)
city_mapping = {'New York': 1, 'San Francisco': 2, 'Los Angeles': 3}
df['city'] = df['city'].map(city_mapping)


Index(['Age', 'Salary', 'purchased', 'Gender', 'Age_missing', 'city_New York',
       'city_San Francisco', 'city'],
      dtype='object')


In [13]:
print(df.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Age                 20 non-null     float64
 1   Salary              20 non-null     float64
 2   purchased           20 non-null     int64  
 3   Gender              20 non-null     int64  
 4   Age_missing         20 non-null     int64  
 5   city_New York       20 non-null     bool   
 6   city_San Francisco  20 non-null     bool   
dtypes: bool(2), float64(2), int64(3)
memory usage: 972.0 bytes
None


In [20]:

print("Columns in DataFrame:", df.columns)


if 'city' not in df.columns:
    print("Reconstructing 'city' column from one-hot encoded columns...")
    df['city'] = df[['city_New York', 'city_San Francisco']].idxmax(axis=1).map(
        {'city_New York': 'New York', 'city_San Francisco': 'San Francisco'}
    )

# Ordinal Encoding
city_mapping = {'New York': 1, 'San Francisco': 2, 'Los Angeles': 3}
df['city'] = df['city'].map(city_mapping)


print(df.head())


Columns in DataFrame: Index(['Age', 'Salary', 'purchased', 'Gender', 'Age_missing', 'city_New York',
       'city_San Francisco', 'city'],
      dtype='object')
    Age   Salary  purchased  Gender  Age_missing  city_New York  \
0  56.0  14433.0          1       0            0          False   
1  69.0  17311.0          1       0            0          False   
2  46.0  17051.0          1       0            0          False   
3  32.0  18420.0          1       0            0          False   
4  60.0  29568.0          0       0            0          False   

   city_San Francisco  city  
0                True   NaN  
1               False   NaN  
2               False   NaN  
3                True   NaN  
4               False   NaN  
