## Types of Algorithms

### Supervised learning
* classification
* regression

### Unsupervised learning
* Clustering

### Reinforced learning


# intro to regression and classification

## linear regresssion

In [63]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.model_selection import train_test_split


data = pd.read_csv('beer-servings.csv')
data.head()


import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder






data.head()


Unnamed: 0.1,Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,0,Afghanistan,0.0,0.0,0.0,0.0,Asia
1,1,Albania,89.0,132.0,54.0,4.9,Europe
2,2,Algeria,25.0,0.0,14.0,0.7,Africa
3,3,Andorra,245.0,138.0,312.0,12.4,Europe
4,4,Angola,217.0,57.0,45.0,5.9,Africa


In [7]:
data = data.iloc[:,1:]
data.head()


num_df = data.select_dtypes(include = "number")
cat_df = data.select_dtypes(include = "object_")

num_df
cat_df

Unnamed: 0,country,continent
0,Afghanistan,Asia
1,Albania,Europe
2,Algeria,Africa
3,Andorra,Europe
4,Angola,Africa
...,...,...
188,Venezuela,South America
189,Vietnam,Asia
190,Yemen,Asia
191,Zambia,Africa


In [9]:
num_cols = num_df.columns.tolist()
print(num_cols)

for col in num_cols:
    num_df[col] = num_df[col].fillna(num_df[col].median())

['beer_servings', 'spirit_servings', 'wine_servings', 'total_litres_of_pure_alcohol']


In [10]:
def replace_outliers(df,column_name):
    q1 = df[column_name].quantile(0.25)
    q3 = df[column_name].quantile(0.75)
    iqr = q3 - q1
    upper_bond = q3 + 1.5*iqr
    lower_bond = q1 - 1.5*iqr
    df[column_name] = df[column_name].clip(upper = upper_bond)  # to replace value above upper bomd to upper bond
    df[column_name] = df[column_name].clip(lower = lower_bond)  # replace with lower bond
    return df[column_name]

In [11]:
for col in num_cols:
    num_df[col] = replace_outliers(num_df,col)

In [17]:
from sklearn.preprocessing import StandardScaler
std_scalar = StandardScaler()
num_df = std_scalar.fit_transform(num_df)

num_df
num_df = pd.DataFrame(num_df,columns=num_cols)
num_df

Unnamed: 0,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol
0,-1.056880,-0.971267,-0.723136,-1.264356
1,-0.151713,0.608759,0.302520,0.042922
2,-0.802619,-0.971267,-0.457225,-1.077602
3,1.434871,0.680578,2.049934,2.043856
4,1.150100,-0.288983,0.131578,0.309713
...,...,...,...,...
188,-0.283928,0.225722,-0.666155,0.789937
189,0.072036,-0.947327,-0.704142,-0.730773
190,-0.995857,-0.971267,-0.723136,-1.237677
191,-0.731427,-0.743839,-0.647161,-0.597378


In [13]:
cat_df
cat_columns = cat_df.columns.to_list()


In [14]:
for col in cat_columns:
    encoder = LabelEncoder()
    cat_df[col] = encoder.fit_transform(cat_df[col])


cat_df    


Unnamed: 0,country,continent
0,0,1
1,1,2
2,2,0
3,3,2
4,4,0
...,...,...
188,188,5
189,189,1
190,190,1
191,191,0


In [18]:
final_df1 = pd.concat([num_df,cat_df],axis = 1)
final_df1

Unnamed: 0,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,country,continent
0,-1.056880,-0.971267,-0.723136,-1.264356,0,1
1,-0.151713,0.608759,0.302520,0.042922,1,2
2,-0.802619,-0.971267,-0.457225,-1.077602,2,0
3,1.434871,0.680578,2.049934,2.043856,3,2
4,1.150100,-0.288983,0.131578,0.309713,4,0
...,...,...,...,...,...,...
188,-0.283928,0.225722,-0.666155,0.789937,188,5
189,0.072036,-0.947327,-0.704142,-0.730773,189,1
190,-0.995857,-0.971267,-0.723136,-1.237677,190,1
191,-0.731427,-0.743839,-0.647161,-0.597378,191,0


In [34]:
y = final_df1["total_litres_of_pure_alcohol"]
X = final_df1.drop("total_litres_of_pure_alcohol",axis = 1)

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)
reg_lin = LinearRegression()
reg_lin.fit(X_train,y_train)
y_pred = reg_lin.predict(X_test)
y_pred

array([ 1.93862402,  2.04135554, -1.01016055, -1.07404501,  0.87051585,
        1.59264491, -0.99597712, -0.85486762, -0.11195542,  0.44987474,
        2.2266087 , -1.00197681,  1.67722153, -0.40215607, -1.06835123,
       -0.93489365,  1.95873733, -0.91494137, -0.94045541,  0.57083053,
       -0.74857203, -1.05102182,  0.93696761,  1.2782916 , -0.79814585,
        0.51769145, -0.94697412,  1.2264858 , -0.87696152,  0.23491263,
       -0.87645609, -0.43624   , -0.51026634,  0.06811789,  0.4538047 ,
        1.57435541,  1.60090433,  1.99676968,  1.30867187])

In [35]:
mse = mean_squared_error(y_test,y_pred)
mse

0.09009354345323532

In [36]:
r2 = r2_score(y_test,y_pred=y_pred)
r2


0.9305871164150155

In [30]:
reg_lin.coef_


array([ 0.44306207,  0.38191279,  0.36338441,  0.00066096, -0.05811375])

## polynomial Regression

In [43]:
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=3,include_bias=True)

X_poly = poly.fit_transform(X_train)


model = LinearRegression()
model.fit(X_poly,y_train)

X_test_poly = poly.transform(X_test)
y_test_pred = model.predict(X_test_poly)


mse1 = mean_squared_error(y_test,y_test_pred)
r = r2_score(y_test,y_test_pred)


print(mse1)
print(r)

0.38402567440488194
0.704126084851489


In [47]:
result_df = pd.DataFrame(columns = ['model','metric','value'])
result_df.loc[len(result_df)] = ['linear regression','mse',mse]
result_df.loc[len(result_df)] = ['linear regression','r2',r2]
result_df.loc[len(result_df)] = ['polynomial regression','mse',mse1]
result_df.loc[len(result_df)] = ['polynomial regression','r2',r]
result_df

Unnamed: 0,model,metric,value
0,linear regression,mse,0.090094
1,linear regression,r2,0.930587
2,polynomial regression,mse,0.384026
3,polynomial regression,r2,0.704126


In [None]:
from sklearn.linear_model import Ridge

rd1 = Ridge()
rd1.fit(X_train,y_train)
y_pred = rd1.predict(X_test)
y_pred

mse2 = mean_squared_error(y_test,y_pred)
R1 = r2_score(y_test,y_pred)
mse2

0.08960142463396192

In [None]:
result_df.loc[len(result_df)] = ['linear regression after ridge','mse',mse2]
result_df.loc[len(result_df)] = ['linear regression after ridge','r2',R1]



In [None]:
from sklearn.linear_model import Lasso

l = Lasso()
l.fit(X_train,y_train)
y_pred = l.predict(X_test)
y_pred

mse3 = mean_squared_error(y_test,y_pred)
R2 = r2_score(y_test,y_pred)

result_df.loc[len(result_df)] = ['linear regression after lasso','mse',mse3]
result_df.loc[len(result_df)] = ['linear regression after lasso','r2',R2]


In [58]:
result_df

Unnamed: 0,model,metric,value
0,linear regression,mse,0.090094
1,linear regression,r2,0.930587
2,polynomial regression,mse,0.384026
3,polynomial regression,r2,0.704126
4,linear regression after ridge,mse,0.089601
5,linear regression after ridge,r2,0.930966
6,linear regression after lasso,mse,1.334181
7,linear regression after lassso,r2,-0.027925


In [61]:
from sklearn.linear_model import ElasticNet

E = ElasticNet()
E.fit(X_train,y_train)

y_pred = E.predict(X_test)

mse4 = mean_squared_error(y_test,y_pred)
R3 = r2_score(y_test,y_pred)

result_df.loc[len(result_df)] = ['linear regression after ElasticNet','mse',mse4]
result_df.loc[len(result_df)] = ['linear regression after ElasticNet','r2',R3]

In [62]:
result_df

Unnamed: 0,model,metric,value
0,linear regression,mse,0.090094
1,linear regression,r2,0.930587
2,polynomial regression,mse,0.384026
3,polynomial regression,r2,0.704126
4,linear regression after ridge,mse,0.089601
5,linear regression after ridge,r2,0.930966
6,linear regression after lasso,mse,1.334181
7,linear regression after lassso,r2,-0.027925
8,linear regression after ElasticNet,mse,0.968439
9,linear regression after ElasticNet,r2,0.253863
