### Imputing the Missing Values: SimpleImputer()

In [1]:
#importing libraries and imputing missing values using simpleimputer()
import pandas as pd
import numpy as np

In [2]:
data = {'one':pd.Series([1,2,5],
                            index=['a','b','e']),
        'two':pd.Series([1,2,3,4],
                            index=['a','b','c','d'])}
df=pd.DataFrame(data)
df.head()

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,,3.0
d,,4.0
e,5.0,


In [3]:
from sklearn.impute import SimpleImputer
imputer=SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(df)
imputed_data=imputer.transform(df.values)
print("After:\n",pd.DataFrame(imputed_data,columns=['one','two']))

After:
         one  two
0  1.000000  1.0
1  2.000000  2.0
2  2.666667  3.0
3  2.666667  4.0
4  5.000000  2.5




### Handling Categorical values: get_dummies

In [4]:
data={'Cust_id':pd.Series([1,2,3,4,5]),
      'Loan_type':pd.Series(['Home Loan',
                             'Personal Loan',
                             'Educated Loan',
                             'Home Loan',
                             'Credit Loan']),
      'Income':pd.Series(['30k','25k','15k','40k','35k'])}
loan_info=pd.DataFrame(data)
print(loan_info)

   Cust_id      Loan_type Income
0        1      Home Loan    30k
1        2  Personal Loan    25k
2        3  Educated Loan    15k
3        4      Home Loan    40k
4        5    Credit Loan    35k


In [5]:
loan_info=pd.get_dummies(loan_info,prefix_sep='-',)
print(loan_info)

   Cust_id  Loan_type-Credit Loan  Loan_type-Educated Loan  \
0        1                  False                    False   
1        2                  False                    False   
2        3                  False                     True   
3        4                  False                    False   
4        5                   True                    False   

   Loan_type-Home Loan  Loan_type-Personal Loan  Income-15k  Income-25k  \
0                 True                    False       False       False   
1                False                     True       False        True   
2                False                    False        True       False   
3                 True                    False       False       False   
4                False                    False       False       False   

   Income-30k  Income-35k  Income-40k  
0        True       False       False  
1       False       False       False  
2       False       False       False  
3       False   

### Handling categorical values: Label Encoder()

In [6]:
data={'cust_id':pd.Series([1,2,3,4,5]),
      'Loan_type':pd.Series(['Home Loan','Personal Loan','Education Loan',
                             'Home Loan','Credit Loan']),
      'Income': pd.Series(['30k','25k','15k','40k','35k'])}
loan_info = pd.DataFrame(data)
print(loan_info)

   cust_id       Loan_type Income
0        1       Home Loan    30k
1        2   Personal Loan    25k
2        3  Education Loan    15k
3        4       Home Loan    40k
4        5     Credit Loan    35k


In [7]:
from sklearn.preprocessing import LabelEncoder
labelencoder=LabelEncoder()
loan_info_upd=loan_info.apply(labelencoder.fit_transform)# df.apply(labelencoder().fit_transform)
print(loan_info_upd)

   cust_id  Loan_type  Income
0        0          2       2
1        1          3       1
2        2          1       0
3        3          2       4
4        4          0       3


### Handling categorical values: One-hot Encoder()

In [8]:
loan_info

Unnamed: 0,cust_id,Loan_type,Income
0,1,Home Loan,30k
1,2,Personal Loan,25k
2,3,Education Loan,15k
3,4,Home Loan,40k
4,5,Credit Loan,35k


In [9]:
from sklearn.preprocessing import OneHotEncoder
onehotencoder=OneHotEncoder()
X=onehotencoder.fit_transform(loan_info).toarray()
print(X)

[[1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 1. 0.]]


In [10]:
data = {
    'Customer_id': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female'],
    'Age': [25, 30, 22, 35, 28, 40, 45, 21, 32, 27],
    'City': ['New York', 'San Francisco', 'Los Angeles', 'Chicago', 'Houston', 'New York', 'Chicago', 'Los Angeles', 'San Francisco', 'Houston'],
    'Product_Category': ['Electronics', 'Clothing', 'Electronics', 'Clothing', 'Home', 'Electronics', 'Home', 'Clothing', 'Electronics', 'Home'],
    'Purchase_Amount': [500, 300, 600, 400, 700, 800, 350, 450, 550, 750]
}

df = pd.DataFrame(data)
df

Unnamed: 0,Customer_id,Gender,Age,City,Product_Category,Purchase_Amount
0,1,Male,25,New York,Electronics,500
1,2,Female,30,San Francisco,Clothing,300
2,3,Male,22,Los Angeles,Electronics,600
3,4,Female,35,Chicago,Clothing,400
4,5,Male,28,Houston,Home,700
5,6,Female,40,New York,Electronics,800
6,7,Male,45,Chicago,Home,350
7,8,Female,21,Los Angeles,Clothing,450
8,9,Male,32,San Francisco,Electronics,550
9,10,Female,27,Houston,Home,750


In [11]:
# perform using pd.get_dummies
df_encoded_dummies=pd.get_dummies(df, columns=['Gender','City','Product_Category'])
df_encoded_dummies # results in true/false type so we convert back to dataframe
# convert result back to data getting error

Unnamed: 0,Customer_id,Age,Purchase_Amount,Gender_Female,Gender_Male,City_Chicago,City_Houston,City_Los Angeles,City_New York,City_San Francisco,Product_Category_Clothing,Product_Category_Electronics,Product_Category_Home
0,1,25,500,False,True,False,False,False,True,False,False,True,False
1,2,30,300,True,False,False,False,False,False,True,True,False,False
2,3,22,600,False,True,False,False,True,False,False,False,True,False
3,4,35,400,True,False,True,False,False,False,False,True,False,False
4,5,28,700,False,True,False,True,False,False,False,False,False,True
5,6,40,800,True,False,False,False,False,True,False,False,True,False
6,7,45,350,False,True,True,False,False,False,False,False,False,True
7,8,21,450,True,False,False,False,True,False,False,True,False,False
8,9,32,550,False,True,False,False,False,False,True,False,True,False
9,10,27,750,True,False,False,True,False,False,False,False,False,True


In [12]:
from sklearn.preprocessing import LabelEncoder
label_encoder=LabelEncoder()
df['City_LabelEncoded']=label_encoder.fit_transform(df['City'])
df

Unnamed: 0,Customer_id,Gender,Age,City,Product_Category,Purchase_Amount,City_LabelEncoded
0,1,Male,25,New York,Electronics,500,3
1,2,Female,30,San Francisco,Clothing,300,4
2,3,Male,22,Los Angeles,Electronics,600,2
3,4,Female,35,Chicago,Clothing,400,0
4,5,Male,28,Houston,Home,700,1
5,6,Female,40,New York,Electronics,800,3
6,7,Male,45,Chicago,Home,350,0
7,8,Female,21,Los Angeles,Clothing,450,2
8,9,Male,32,San Francisco,Electronics,550,4
9,10,Female,27,Houston,Home,750,1


### Scaling:StandardScaler()

In [13]:
import numpy as np
from sklearn import preprocessing
data=np.array([[-100.3],[27.5],[0],[-200.9],[1000]])
print('\nBefore scaling\n',data)

standard_scaler=preprocessing.StandardScaler()
scaled=standard_scaler.fit_transform(data)

print('\nAfter standard scaler\n',scaled)


Before scaling
 [[-100.3]
 [  27.5]
 [   0. ]
 [-200.9]
 [1000. ]]

After standard scaler
 [[-0.5646401 ]
 [-0.27077707]
 [-0.33401051]
 [-0.79595951]
 [ 1.96538718]]


# Scaling:MinMaxScaler()

In [14]:
from sklearn.preprocessing import MinMaxScaler
data=np.array([[-100.3],[27.5],[0],[-200.9],[1000]])
print('Before Scaling:',data)
minmax_scaler=preprocessing.MinMaxScaler(feature_range=(1,2))
scaled_max=minmax_scaler.fit_transform(data)
print('\nAfter Standard Scaler',scaled_max)

Before Scaling: [[-100.3]
 [  27.5]
 [   0. ]
 [-200.9]
 [1000. ]]

After Standard Scaler [[1.08377051]
 [1.19019069]
 [1.1672912 ]
 [1.        ]
 [2.        ]]


### Scaling:RobustScaler()

In [16]:
data=np.array([[-100.3],[27.5],[0],[-200.9],[1000]])
print('Before Scaling\n', data)
robust_scaler=preprocessing.RobustScaler()
scaled_robust=robust_scaler.fit_transform(data)
print('\nAfter standard scaler\n',scaled_robust)

Before Scaling
 [[-100.3]
 [  27.5]
 [   0. ]
 [-200.9]
 [1000. ]]

After standard scaler
 [[-0.78482003]
 [ 0.21517997]
 [ 0.        ]
 [-1.57198748]
 [ 7.82472613]]


### Feature Selection

In [17]:
import seaborn as sns
import matplotlib.pyplot as plt
#import the dataset
df=pd.read_csv('Car_sales.csv')
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'Car_sales.csv'