In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
dataset = pd.read_csv(r"C:\Users\patil\Desktop\Know IT CADC\Practical Machine Learning\Common_Folder\Day 2\Data.csv")
dataset.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


## Seperating Independent & Dependent values

In [3]:
# Seperating Independent columns in X & Dependent columns into Y using indexing
x = dataset.iloc[: , :-1].values
y = dataset.iloc[: , -1].values

print(x)
print(y)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]
['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


### Taking care of missing data

In [4]:
# To replace age and salary columns by corresponding mean value using sklearn
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Which cloumns to change
imputer.fit(x[ : , 1:3])

# Overwriting old values with new values
x[ : , 1:3] = imputer.transform(x[ : , 1:3])

print(x)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


## Encoding categorical data

In [5]:
# To replace city column with 3 columns (dummy variables using sklearn)
# transformers = [(action, the class object who will do the object transformation,
# columns index on which to apply transformation)
# remainder = 'passthrough' will keep all other columns as it is, other wise it will keep only newly created columns]

In [6]:
# Same like pd.getdummies

ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
# fit_transform does not return o/p in numpy array format it is array format
# and ML Model will need the data in numpy array, so conversion is necessary

x = np.array(ct.fit_transform(x))
ct

In [7]:
print(x)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


## Encoding the dependent variable

In [8]:
le = LabelEncoder()
y = le.fit_transform(y)

In [9]:
print(y)

[0 1 0 0 1 1 0 1 0 1]


In [10]:
data = {
        'city':['spain', 'france', 'germany', 'spain', 'france', 'germany'],
        'age':[89,77,56,67,66,77],
        'gender':['female', 'male', 'male', 'male', 'female', np.NaN],
        'review':['good', 'good', 'bad', 'good', 'bad', 'good'],
        'education':['UG', 'PG', 'PHD', 'PG', 'UG', 'PHD'],
        'purchase':['yes', 'yes', 'no', 'yes', 'no', 'yes']
        }

In [11]:
df = pd.DataFrame(data)
df

Unnamed: 0,city,age,gender,review,education,purchase
0,spain,89,female,good,UG,yes
1,france,77,male,good,PG,yes
2,germany,56,male,bad,PHD,no
3,spain,67,male,good,PG,yes
4,france,66,female,bad,UG,no
5,germany,77,,good,PHD,yes


In [12]:
ohe = OneHotEncoder(drop='first', sparse=False)
x = df.iloc[: ,0].values

ohe.fit(x.reshape(-1, 1))
x = ohe.transform(x.reshape(-1, 1))



In [13]:
x

array([[0., 1.],
       [0., 0.],
       [1., 0.],
       [0., 1.],
       [0., 0.],
       [1., 0.]])

In [14]:
# only taking column which contains null value in g
g = df.iloc[: ,2].values

# filling null values with most_frequent(mode)
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imputer.fit(g.reshape(-1, 1))    # reshaping 1D array to 2D array

g = imputer.transform(g.reshape(-1, 1))
g

array([['female'],
       ['male'],
       ['male'],
       ['male'],
       ['female'],
       ['male']], dtype=object)

## Splitting dataset into Train & Test

In [15]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

ValueError: Found input variables with inconsistent numbers of samples: [6, 10]

In [None]:
x_train

In [None]:
sc = StandardScaler()
x_train[: , 3:]