# Cheatsheet for sklearn

Content:

1. Classification Problems  

2. Regression Problems 

3. Encoding, Normalization, Dimensionality Reduction, StandardScaler, etc.

In [None]:
#General imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
# Pick the independent variables (features, X) and the dependent variable (target, y).
# NOTE: `df` is expected to be a DataFrame loaded beforehand (not shown in this cheatsheet).
X = df.loc[:, ["Input_1", "Input_2"]]
y = df.loc[:, "Target"]

# Split the data: 80 % of the rows go to the training set, the rest to the test set
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
)

In [None]:
# Create an (unfitted) Gaussian Naive Bayes classifier
model1 = GaussianNB()

# Fit it on the training split; as the last expression of the cell,
# the fitted estimator is also shown as the cell output
model1.fit(X=X_train, y=y_train)

Naive Bayes is not the best model if:  
- the independent variables are strongly correlated with each other (check the correlation)
- the independent variables are not normally distributed (see histplot)

Variants:
- MultinomialNB

## Support Vector Machine (SVM)  - including GridSearchCV & RepeatedKFold

In [None]:
from sklearn.model_selection import GridSearchCV, RepeatedKFold
from sklearn.svm import SVC, LinearSVC

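GridSearchCV is used to find the best hyperparameters by trying every combination in the parameter grid.  
RepeatedKFold repeats the k-fold train/validation split of the training set several times; the scores are averaged to give a more reliable estimate for each hyperparameter combination.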

In [None]:
# Hyperparameter grid for the SVM grid search: every combination of
# kernel, C and gamma listed here is evaluated.
# Each entry is written as 'name': [candidate values] and separated by a comma.
param_grid = {
    'kernel': ['rbf', 'linear', 'poly'],
    'C': [0.01, 0.05, 0.1, 0.5, 1, 5, 10],
    'gamma': [0.01, 0.05, 0.1, 0.5, 1, 5, 10],
}
# Attention: The poly-kernel is very time-intensive

In [None]:
# Pick the independent variables (features, X) and the dependent variable (target, y).
# NOTE: `df` is expected to be a DataFrame loaded beforehand (not shown in this cheatsheet).
X = df.loc[:, ["Input_1", "Input_2"]]
y = df.loc[:, "Target"]

# Split the data: 80 % of the rows go to the training set, the rest to the test set
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
)

In [None]:
# Define the model: the grid search wraps the estimator. `param_grid` and `cv`
# are arguments of GridSearchCV, not of the SVM itself. SVC (not LinearSVC) is
# used because the grid tunes 'kernel' and 'gamma', which LinearSVC does not have.
# NOTE: RepeatedKFold() with its defaults refits every parameter combination many
# times, so this cell can take a while.
model1 = GridSearchCV(SVC(), param_grid=param_grid, cv=RepeatedKFold())

# Train the model on the training data (runs the full cross-validated search)
model1.fit(X_train, y_train)

# Show the best hyperparameter combination found by the search
print(model1.best_params_)
# Attention: If the best_params are located at the edges of the chosen parameters, check further parameters

In [None]:
# First line: best cross-validation score stored on the fitted search object
# (assumes `model1` is a fitted GridSearchCV). Second line: score on the test data.
print(model1.best_score_, model1.score(X_test, y_test), sep="\n")

## Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Pick the independent variables (features, X) and the dependent variable (target, y).
# NOTE: `df` is expected to be a DataFrame loaded beforehand (not shown in this cheatsheet).
X = df.loc[:, ["Input_1", "Input_2"]]
y = df.loc[:, "Target"]

# Split the data: 80 % of the rows go to the training set, the rest to the test set
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
)

In [None]:
# Create an ordinary least-squares linear regression model
model2 = LinearRegression()

# Fit it on the training split; the fitted estimator is shown as the cell output
model2.fit(X=X_train, y=y_train)

In [None]:
# Fitted parameters of the linear model: first line the coefficients
# (one per input column), second line the intercept
print(model2.coef_, model2.intercept_, sep="\n")

In [None]:
# Score on the training data (first line) vs. the test data (second line);
# for LinearRegression, score() is the R^2 of the prediction
print(model2.score(X_train, y_train), model2.score(X_test, y_test), sep="\n")

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Pick the independent variables (features, X) and the dependent variable (target, y).
# NOTE: `df` is expected to be a DataFrame loaded beforehand (not shown in this cheatsheet).
X = df.loc[:, ["Input_1", "Input_2"]]
y = df.loc[:, "Target"]

# Split the data: 80 % of the rows go to the training set, the rest to the test set
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
)

In [None]:
# Grid search over the regularization strength C of a logistic regression.
# - The SVM grid defined earlier cannot be reused: it contains 'kernel' and
#   'gamma', which are not LogisticRegression parameters.
# - `cv` must be a splitter *instance*, hence RepeatedKFold() with parentheses.
model3 = GridSearchCV(
    LogisticRegression(),
    param_grid={'C': [0.01, 0.05, 0.1, 0.5, 1, 5, 10]},
    cv=RepeatedKFold(),
)

# Run the cross-validated search on the training data
model3.fit(X_train, y_train)

In [None]:
# Score on the training data (first line) vs. the test data (second line);
# a large gap between the two hints at overfitting
print(model3.score(X_train, y_train), model3.score(X_test, y_test), sep="\n")

## KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Pick the independent variables (features, X) and the dependent variable (target, y).
# NOTE: `df` is expected to be a DataFrame loaded beforehand (not shown in this cheatsheet).
X = df.loc[:, ["Input_1", "Input_2"]]
y = df.loc[:, "Target"]

# Split the data: 80 % of the rows go to the training set, the rest to the test set
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
)

In [None]:
# k-nearest-neighbours classifier using the 3 closest training points;
# p=1 selects the Manhattan distance (Minkowski metric with power 1)
model4 = KNeighborsClassifier(p=1, n_neighbors=3)

# Fit it on the training split; the fitted estimator is shown as the cell output
model4.fit(X=X_train, y=y_train)

In [None]:
# Score on the training data (first line) vs. the test data (second line);
# a large gap between the two hints at overfitting
print(model4.score(X_train, y_train), model4.score(X_test, y_test), sep="\n")

## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
#################! TODO: parameters go here (placeholder cell - nothing is defined yet)

In [None]:
# Pick the independent variables (features, X) and the dependent variable (target, y).
# NOTE: `df` is expected to be a DataFrame loaded beforehand (not shown in this cheatsheet).
X = df.loc[:, ["Input_1", "Input_2"]]
y = df.loc[:, "Target"]

# Split the data: 80 % of the rows go to the training set, the rest to the test set
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
)

In [None]:
# Decision tree classifier with default settings
model5 = DecisionTreeClassifier()

# Fit it on the training split; the fitted estimator is shown as the cell output
model5.fit(X=X_train, y=y_train)

In [None]:
# Score on the training data (first line) vs. the test data (second line);
# a large gap between the two hints at overfitting
print(model5.score(X_train, y_train), model5.score(X_test, y_test), sep="\n")

In [None]:
from sklearn.tree import plot_tree

# Draw the fitted decision tree. The node labels use the column names of the
# feature frame X that the tree was trained on (passed as a plain list of str).
plot_tree(model5, feature_names=list(X.columns))

# plt.show must be *called*; the bare attribute `plt.show` does nothing
plt.show()

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
#################! TODO: parameters go here (placeholder cell - nothing is defined yet)

In [None]:
# Pick the independent variables (features, X) and the dependent variable (target, y).
# NOTE: `df` is expected to be a DataFrame loaded beforehand (not shown in this cheatsheet).
X = df.loc[:, ["Input_1", "Input_2"]]
y = df.loc[:, "Target"]

# Split the data: 80 % of the rows go to the training set, the rest to the test set
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
)

In [None]:
# Random forest classifier with default settings
model6 = RandomForestClassifier()

# Fit it on the training split; the fitted estimator is shown as the cell output
model6.fit(X=X_train, y=y_train)

In [None]:
# Score on the training data (first line) vs. the test data (second line);
# a large gap between the two hints at overfitting
print(model6.score(X_train, y_train), model6.score(X_test, y_test), sep="\n")