https://github.com/campusx-official/100-days-of-machine-learning <br>
https://github.com/campusx-official/100-days-of-machine-learning/blob/main/day28-column-transformer/day28.ipynb<br>
https://github.com/krishnaik06/Pipelines-Using-Sklearn/blob/master/SklearnPipeline.ipynb

## Preprocessing

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

In [None]:
# Imported here so the cell runs on its own; these names are not imported
# anywhere else in the notebook.
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, RobustScaler

# Split the feature columns of x by dtype: int64/float64 columns get scaled,
# object columns get one-hot encoded.
# NOTE(review): columns of any other dtype (bool, category, int32, ...) land
# in neither list; with ColumnTransformer's default remainder they would not
# be passed through — confirm that is intended.
numeric = [cname for cname in x.columns if x[cname].dtype in ['int64', 'float64']]
categoric = [cname for cname in x.columns if x[cname].dtype == 'object']

# scaling numericals
numerical_transformer = Pipeline(steps=[
    ('scaler', RobustScaler())
])

# one-hot encoding categorical; categories unseen during fit are ignored
# at transform time instead of raising
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# bundle preprocessing: each transformer is applied to its own column list
preprocessor = ColumnTransformer(
    transformers=[
        ('numerical', numerical_transformer, numeric),
        ('categorical', categorical_transformer, categoric),
    ])

In [None]:
from sklearn.model_selection import cross_val_predict

# Out-of-fold predictions for every row of x, using 10-fold cross-validation.
y_pred = cross_val_predict(estimator=pipe, X=x, y=y, cv=10)

In [None]:
# One-hot encode the columns named in `columns` and rebind df to the result.
# NOTE(review): the `columns` list is empty here — this looks like a template;
# fill in the categorical column names before use.
df= pd.get_dummies(df, columns=[])

In [None]:
# Encode the RainToday flag as integers: 'No' -> 0, 'Yes' -> 1.
# NOTE(review): Series.map with a dict gives NaN for values missing from the
# dict — confirm the column only holds 'No' / 'Yes'.
weather['RainToday'] = weather['RainToday'].map({'No':0, 'Yes':1})

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Variance inflation factor for every column of x, one row per column.
vif = pd.DataFrame({
    "Variable": x.columns,
    "VIF": [
        variance_inflation_factor(x.values, col_idx)
        for col_idx in range(len(x.columns))
    ],
})
# Drop the row labelled 'intercept' (if present) from the report.
vif = vif.loc[vif['Variable'] != 'intercept']
vif

## Scaling

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [None]:
# Min-max scale the four measurement columns, leaving `iris` untouched.
feature_cols = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']

# copy=True: pd.DataFrame(iris) alone may share data with `iris`, so the
# assignment below could silently overwrite the original frame.
iris_scaled = pd.DataFrame(iris, copy=True)

# MinMaxScaler scales each column independently, so fitting all four at once
# gives the same values as a per-column loop and leaves `mscaler` fitted on
# every feature (reusable for transforming new rows).
mscaler = MinMaxScaler()
iris_scaled[feature_cols] = mscaler.fit_transform(iris_scaled[feature_cols])
iris_scaled

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits.
sc_x = StandardScaler().fit(x_train)
x_trainscaled = sc_x.transform(x_train)
x_testscaled = sc_x.transform(x_test)

## Regression:

In [2]:
from sklearn.linear_model import LinearRegression

In [None]:
# Linear regression fitted on the training split.
reg = LinearRegression().fit(x_train, y_train)
reg.coef_
# Predictions for the test split.
y_pred = reg.predict(X=x_test)

In [None]:
# Imported here because the notebook only imports LinearRegression from
# sklearn.linear_model; without this the cell raises NameError.
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings, fitted on the training split.
logreg = LogisticRegression()
logreg.fit(x_train, y_train)
logreg.coef_
# Score of the fitted model on the test split.
logreg.score(x_test, y_test)

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

# Expand x into polynomial terms up to degree 6, without the bias column.
poly = PolynomialFeatures(degree=6, include_bias=False)
poly_features = poly.fit(x).transform(x)

# 70/30 train/test split of the expanded features, seeded for repeatability.
x_train, x_test, y_train, y_test = train_test_split(
    poly_features,
    y,
    test_size=0.3,
    random_state=42,
)

# Plain linear regression on the polynomial features.
poly_reg_model = LinearRegression().fit(x_train, y_train)
poly_reg_y_predicted = poly_reg_model.predict(x_test)

In [None]:
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

# Ridge regression with alpha = 0.1.
# `normalize` is no longer passed: False was its default, and the parameter
# was removed in scikit-learn 1.2, where passing it raises TypeError.
rigmodel = Ridge(alpha=0.1)
rigmodel.fit(x_train, y_train)

y_pred1 = rigmodel.predict(x_test)

# Score of the ridge model on the test split.
score1 = rigmodel.score(x_test, y_test)


# Lasso regression with the same alpha (same note on `normalize`).
lasmodel = Lasso(alpha=0.1)
lasmodel.fit(x_train, y_train)

# Note: this rebinds y_pred to the lasso predictions.
y_pred = lasmodel.predict(x_test)

# Score of the lasso model on the test split.
score2 = lasmodel.score(x_test, y_test)

## Classification

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# k-nearest-neighbours classifier (k = 10) on the x2/y2 split.
neigh = KNeighborsClassifier(n_neighbors=10)
neigh.fit(x2_train, y2_train)
# Predict on x2_test — the split this model is trained and scored on.
# The original predicted on x_test, which belongs to a different split.
y2_pred2 = neigh.predict(x2_test)
neigh.score(x2_test, y2_test)

In [None]:
from sklearn.naive_bayes import GaussianNB
# Gaussian Naive Bayes classifier with default settings (not fitted in this cell).
model = GaussianNB()

In [None]:
# Imports for tuning a decision tree and for exporting / rendering it.
from sklearn.model_selection import GridSearchCV
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz  
from IPython.display import Image  
import pydotplus
import six
import sys
# Register the standalone `six` package under the module path
# 'sklearn.externals.six', so imports of that path resolve to it.
# NOTE(review): presumably a shim for a dependency that still imports the
# old path — confirm it is still needed.
sys.modules['sklearn.externals.six'] = six
from six import StringIO

# Grid-search a decision tree over the split criterion and max_depth 2..10,
# ranking candidates by ROC AUC with 10-fold cross-validation.
clf = DecisionTreeClassifier()
params = [{'criterion': ["gini", "entropy"],
         'max_depth': [2,3,4,5,6,7,8,9,10]}]
gs_tree = GridSearchCV(clf,param_grid=params,scoring='roc_auc',cv=10)
# The search runs on the full x, y (no train/test split in this cell).
gs_tree.fit(x, y)
gs_tree.best_params_

In [None]:
# Write the tree `clf1` as Graphviz DOT text into an in-memory buffer.
# NOTE(review): `clf1` is not defined in this part of the notebook —
# it is expected to be a fitted tree created elsewhere.
dot_data = StringIO()
export_graphviz(
    clf1,
    out_file=dot_data,
    feature_names=x.columns,
    class_names=['0', '1'],
    filled=True,
    rounded=True,
    special_characters=True,
)

# Build the graph from the DOT text, save it as a PNG and show it inline.
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('credit.png')
Image(graph.create_png())

In [None]:
# Print the tree `clf1` as text, labelling the features with x's column names.
print(tree.export_text(clf1,feature_names = list(x.columns)))

In [None]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron: three hidden layers of 13 ReLU units each,
# capped at 500 training iterations, trained on the scaled training split.
clf = MLPClassifier(
    hidden_layer_sizes=(13, 13, 13),
    activation="relu",
    max_iter=500,
)
clf.fit(x_trainscaled, y_train)

# Predictions and score on the scaled test split.
y_pred = clf.predict(x_testscaled)
clf.score(x_testscaled, y_test)

# Weights and intercepts of the fitted network.
weights = clf.coefs_
bias = clf.intercepts_

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Tune SVC hyper-parameters with an exhaustive grid search.
params = {
    'C': list(range(1, 11)),
    'kernel': ['linear', 'poly', 'rbf'],
    'gamma': ['scale', 'auto'],
}

# 10-fold cross-validation for each of the 10 * 3 * 2 = 60 combinations.
grid_search = GridSearchCV(SVC(random_state=105), params, cv=10)

grid_search.fit(x_train, y_train)

grid_search.best_params_

# Model with best parameters. GridSearchCV (default refit=True) has already
# refit best_estimator_ on the whole training set, so it is used as-is;
# the original second model.fit(x_train, y_train) only repeated that work.
model = grid_search.best_estimator_

# Performance metrics of model on the test split
print("Performance metrics:")
print(classification_report(y_test, model.predict(x_test)))

## Scores

In [3]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [None]:
# Root-mean-squared error of the predictions against the test targets.
np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))  # RMSE
# R^2 score of the same predictions.
r2_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Imported here because the notebook only imports individual functions from
# sklearn.metrics; without this, `metrics` is undefined and the cell raises NameError.
from sklearn import metrics

# Confusion matrix as a labelled table: rows are the true class,
# columns are the predicted class.
cm = pd.DataFrame(
    metrics.confusion_matrix(y_test, y_pred),
    index=['True_0', 'True_1'],
    columns=['Predicted_0', 'Predicted_1'],
)

In [None]:
# Unpack the 2x2 confusion matrix (rows = true class, columns = predicted).
# np.asarray makes this work whether `cm` is a DataFrame or a plain array;
# `cm[0,0]` on a DataFrame is a column lookup and raises KeyError.
tn, fp, fn, tp = np.asarray(cm).ravel()

acc = (tp+tn)/(tp+tn+fp+fn)   # share of all samples classified correctly
tpr = tp/(fn+tp)              # true positive rate
tnr = tn/(tn+fp)              # true negative rate

print("Accuracy = ", acc)
print("Sensitivity = ", tpr)
print("Specificity = ", tnr)

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

# Shuffled 10-fold split, seeded so the folds are reproducible.
cv = KFold(n_splits=10, random_state=105, shuffle=True)

# Score all three metrics in one cross-validation run: each fold is fitted
# once, instead of three times with three separate cross_val_score calls.
cv_scores = cross_validate(pipe, x, y, cv=cv,
                           scoring=['accuracy', 'precision', 'recall'])

accuracy = cv_scores['test_accuracy']
np.mean(accuracy)

precision = cv_scores['test_precision']
np.mean(precision)

recall = cv_scores['test_recall']
np.mean(recall)

In [None]:
# plot_roc_curve was removed in scikit-learn 1.2; RocCurveDisplay.from_estimator
# (available since 1.0) is its replacement and takes the same arguments.
from sklearn.metrics import RocCurveDisplay, roc_auc_score
pipe.fit(x, y)
# NOTE(review): the curve is drawn on the same x, y the pipeline was just
# fitted on, so it reflects training performance.
RocCurveDisplay.from_estimator(pipe, x, y)

# NOTE(review): `y_pred` is defined in another cell — confirm it lines up
# with `y` and holds the scores/labels intended for AUC.
roc_auc_score(y,y_pred)

## Visualisations

In [None]:
# Scatter of sepal width against petal width, one colour per species,
# with a legend.
(
    sns.FacetGrid(iris, hue="Species", height=6)
    .map(plt.scatter, 'SepalWidthCm', 'PetalWidthCm')
    .add_legend()
)

In [None]:
# Pairwise plot grid of the four measurement columns, coloured by species.
sns.pairplot(
    data=iris[[
        'SepalLengthCm',
        'SepalWidthCm',
        'PetalLengthCm',
        'PetalWidthCm',
        'Species',
    ]],
    hue="Species",
)

In [None]:
# Box plot of df's columns using the DataFrame's built-in plotting method.
df.boxplot()

In [None]:
# Histogram of the values in diabetes_age, with title and axis labels.
plt.hist(diabetes_age)
plt.ylabel('Frequency')
plt.xlabel('Age')
plt.title('Distribution of Age')
plt.show()

In [None]:
# Frequency table of the 'class' column.
diabetes_classLabels = diabetes['class'].value_counts().to_frame()
diabetes_classLabels
# Count plot of the same column.
sns.countplot(data=diabetes, x='class').set(
    title="Countplot for Class Label Distribution"
)
plt.show()