#### Whenever we start coding, we always begin by importing the libraries. My flow is to import the base libraries first and then import the other libraries when they are required.

## Importing the base libraries

In [1]:
import numpy as np # numpy is used to create the numpy arrays
import pandas as pd # pandas is used to work with the dataset
import matplotlib.pyplot  # is used to visualize the data
# to run the cell either you can click on the run button or you can use shift+enter in the notebook

## Importing the dataset

In [2]:
# Load the dataset into a variable using the pandas reader that matches the file type (CSV here).
# Normally the complete path of the file must be provided, but because the file has been
# uploaded into our working directory the bare file name is enough.
df = pd.read_csv(filepath_or_buffer='Data.csv')
print(df)

   Country   Age   Salary Purchased
0   France  44.0  72000.0        No
1    Spain  27.0  48000.0       Yes
2  Germany  30.0  54000.0        No
3    Spain  38.0  61000.0        No
4  Germany  40.0      NaN       Yes
5   France  35.0  58000.0       Yes
6    Spain   NaN  52000.0        No
7   France  48.0  79000.0       Yes
8  Germany  50.0  83000.0        No
9   France  37.0  67000.0       Yes


### EDA - we get to know our dataset: its complete info, missing values, outliers, and the frequency of string/categorical values.

In [3]:
# info() prints the index range, each column's non-null count and dtype, and the memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    10 non-null     object 
 1   Age        9 non-null      float64
 2   Salary     9 non-null      float64
 3   Purchased  10 non-null     object 
dtypes: float64(2), object(2)
memory usage: 448.0+ bytes


In [4]:
# shape gives the dimensions of the DataFrame as (number of rows, number of columns)
df.shape

(10, 4)

In [5]:
# Count the exact number of missing values in every column:
# isna() flags each missing entry as True, and summing down the rows (axis=0)
# adds those flags up per column.
df.isna().sum(axis=0)

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

In [6]:
# Compare the mean and the median of the Age column. The two values are close to each
# other, which suggests there are no strong outliers (a quick heuristic, not a formal test).
print(
    'mean and median of the age column',
    df['Age'].mean(),
    df['Age'].median(),
    sep='\n',
)

mean and median of the age column
38.77777777777778
38.0


In [7]:
# Compare the mean and the median of the Salary column.
# Rounded, the mean is about 64000 and the median is 61000, a difference of about 3000.
# That gap is small relative to the salary values themselves, so we conclude that
# there are no outliers.
print(
    'mean and median of the Salary column',
    df['Salary'].mean(),
    df['Salary'].median(),
    sep='\n',
)

mean and median of the Salary column
63777.77777777778
61000.0


#### Since our ML models can work only with numeric values, we need to know whether our dataset contains categorical values

In [8]:
# value_counts() reports how many times each category occurs in a column
print('frequency count for country column', df["Country"].value_counts(), sep='\n')
print('frequency count for purchased column', df["Purchased"].value_counts(), sep='\n')

frequency count for country column
France     4
Spain      3
Germany    3
Name: Country, dtype: int64
frequency count for purchased column
No     5
Yes    5
Name: Purchased, dtype: int64


# Data Pre-processing

## Extract x and y from the dataset. For this we use the `iloc` indexer and the `values` attribute

In [9]:
# Every column except the last one forms the feature matrix x; the last column is the target y.
# .values extracts only the values from the DataFrame/Series and returns them as an ndarray.
x, y = df.iloc[:, :-1].values, df.iloc[:, -1].values
print(x, y, sep='\n')

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]
['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


### Before we split the dataset into the training set and the test set, we perform all optional pre-processing steps, if required

## Taking care of missing values

In [10]:
# Missing data is handled with the SimpleImputer class of the sklearn.impute module
from sklearn.impute import SimpleImputer
# To use the functions of a class we first create an object of that class and then
# call the functions through the object; here that object is `imputer`.
# strategy='mean' fills each NaN with the mean of its column; the other accepted
# strategies are 'median', 'most_frequent' (the mode) and 'constant'.
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
# Learn the column means of Age and Salary (column indices 1 and 2) ...
imputer.fit(x[:, 1:3])
# ... then write the filled-in values back into the same columns of x
x[:, 1:3] = imputer.transform(x[:, 1:3])
print(x)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


## Encoding the categorical data

In [12]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# One-hot encode the Country column (index 0) and pass the remaining columns through
# unchanged. In the result the encoded columns come first, followed by the
# passthrough columns.
# OneHotEncoder produces a sparse matrix, and ColumnTransformer keeps the stacked
# result sparse whenever its overall density is below sparse_threshold (0.3 by default).
# sparse_threshold=0 makes this cell always return a dense ndarray, no matter how
# many categories the column has, so printing and the later scaling step keep working.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
x = ct.fit_transform(x)
print(x)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


## Since y also has categorical data in the form of yes and no, we will encode it too

In [14]:
# y holds label-type data ('Yes'/'No'), so LabelEncoder is used to turn it into integers
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# Learn the set of distinct labels first, then map every label to its integer code
le.fit(y)
y = le.transform(y)
print(y)

[0 1 0 0 1 1 0 1 0 1]


## Feature scaling

In [16]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# Learn the per-column mean and standard deviation, then standardize every column of x
sc.fit(x)
x = sc.transform(x)
# After scaling, numpy would print these floats in scientific notation;
# suppress=True switches the printout to a readable fixed-point format
# (only the display changes, not the stored values).
np.set_printoptions(suppress=True)
print(x)
# NOTE(review): the scaler is fit on all rows of x before the train/test split that
# happens later in the notebook, so the test rows influence the scaling statistics.
# It also standardizes the one-hot encoded Country columns together with Age and Salary.
# Consider fitting on the training rows only and scaling just the numeric columns.

[[ 1.22474487 -0.65465367 -0.65465367  0.75887436  0.74947325]
 [-0.81649658 -0.65465367  1.52752523 -1.71150388 -1.43817841]
 [-0.81649658  1.52752523 -0.65465367 -1.27555478 -0.89126549]
 [-0.81649658 -0.65465367  1.52752523 -0.11302384 -0.25320042]
 [-0.81649658  1.52752523 -0.65465367  0.17760889  0.        ]
 [ 1.22474487 -0.65465367 -0.65465367 -0.54897294 -0.52665688]
 [-0.81649658 -0.65465367  1.52752523  0.         -1.0735698 ]
 [ 1.22474487 -0.65465367 -0.65465367  1.34013983  1.38753832]
 [-0.81649658  1.52752523 -0.65465367  1.63077256  1.75214693]
 [ 1.22474487 -0.65465367 -0.65465367 -0.25834021  0.29371249]]


## Splitting the x and y into training set and test set

In [17]:
from sklearn.model_selection import train_test_split
# train_test_split splits x and y into a training set and a test set, giving four outputs.
# The rows are shuffled while splitting; passing a fixed value as 'random_state' makes
# the shuffling the same for everyone.
# We also give the size of either the training part (train_size) or the test part
# (test_size); when neither is given the default ratio is 75:25.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0, test_size=0.2)
print(x_train, x_test, y_train, y_test, sep='\n')

[[-0.81649658  1.52752523 -0.65465367  0.17760889  0.        ]
 [ 1.22474487 -0.65465367 -0.65465367 -0.25834021  0.29371249]
 [-0.81649658 -0.65465367  1.52752523 -1.71150388 -1.43817841]
 [-0.81649658 -0.65465367  1.52752523  0.         -1.0735698 ]
 [ 1.22474487 -0.65465367 -0.65465367  1.34013983  1.38753832]
 [-0.81649658 -0.65465367  1.52752523 -0.11302384 -0.25320042]
 [ 1.22474487 -0.65465367 -0.65465367  0.75887436  0.74947325]
 [ 1.22474487 -0.65465367 -0.65465367 -0.54897294 -0.52665688]]
[[-0.81649658  1.52752523 -0.65465367 -1.27555478 -0.89126549]
 [-0.81649658  1.52752523 -0.65465367  1.63077256  1.75214693]]
[1 1 1 0 1 0 0 1]
[0 0]
