# Data Science Cheat Sheet

Exploratory Data Analytics

1. Descriptive Analytics
2. Diagnostic Analytics
3. Predictive Analytics
4. Prescriptive Analytics

Categorical Variables:
    1. Nominal - Male/Female
    2. Ordinal - Bachelor/Master
    
Numerical Variables:
    1. Discrete - #of goals
    2. Continuous - height, weight

Statistics Subfields:

    1. Descriptive Statistics
    2. Inference
    3. Risk and Probability
    4. Correlation and Relationships
    5. Modeling

## Import Data

In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [None]:
# df = pd.read_excel("file_name.xlsx")
# df = pd.read_stata("file_name.dta")
# df = pd.read_sas("file_name.sas7bdat")
# df = pd.read_hdf("file_name.h5")

In [None]:
data = pd.read_csv("file_name.csv")
df = data.copy()
df.head(10)

## EDA - Exploratory Data Analysis

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
# Showing number of null values

df.isnull().sum()

In [None]:
# Visualizing null values
import matplotlib.pyplot as plt  # plt is used below but is not imported until a later cell
import seaborn as sns

plt.figure(figsize=(16, 8))
# Each heatmap cell marks whether the corresponding df entry is null
sns.heatmap(df.isnull(), yticklabels=False, cbar=False, cmap='Dark2_r')

Show count of unique values for each column

In [None]:
# Print every column name next to its number of distinct (non-null) values
for col_name, n_distinct in df.nunique().items():
    print(col_name, '\t', n_distinct)

Checking Imbalance (for classification model target variable)

In [None]:
# Class counts of the target column (replace "target" with your own label column)
print(data.target.value_counts())

# Pass the plotted vector by keyword: in recent seaborn releases the first
# positional argument of countplot is `data`, not the variable to count
sns.countplot(x=data.iloc[:, -1], palette=['green', 'red'])
plt.title("[0] == Not Disease, [1] == Disease");

In [None]:
#Alternative for graph of imbalance for target
#below instead of"target" I will replace my own output name in dataframe
df.target.value_counts().plot(kind='bar', title='Count (target)')

Correlation of columns with each other using appropriate visuals

In [None]:
# subdata = df[['age','trestbps','chol','thalach','oldpeak', 'target']]
sns.pairplot(data = df, hue = 'target') #or choose data = subdata

Correlation of columns with each other (with numbers)

In [None]:
plt.subplots(figsize=(15,10))
# Correlate numeric columns only: pandas >= 2.0 raises on text columns
# instead of silently dropping them
numeric_df = df.select_dtypes('number')
sns.heatmap(numeric_df.corr(), annot=True, cmap=sns.diverging_palette(220, 20, as_cmap=True))

## Data Cleaning

### Change column names to lower and rename

In [None]:
df.columns = df.columns.str.lower()

In [None]:
df.rename(columns = {'old_name':'new_name'}, inplace = True)

#or

df.columns = ['col1', 'col2', 'col3']

### Drop unimportant columns

In [None]:
df.drop('column_name',inplace=True,axis=1)

### Add new column 

In [None]:
df['age'] = df['this_year'] - df['birth_year']

### Checking duplicate values

In [None]:
# Rows that repeat an earlier row; duplicated() already yields a boolean mask
df[df.duplicated()]

#### Drop duplicates

In [None]:
df.drop_duplicates(inplace=True)

### Impute Missing

In [None]:
# Fill missing values with the most frequent value of the column.
# The result is assigned back instead of calling fillna(inplace=True) on a
# column selection: that chained in-place form is deprecated in pandas 2.x
# and does not update df under copy-on-write.
df['Column_Name'] = df['Column_Name'].fillna(df['Column_Name'].mode()[0])

In [None]:
from sklearn.impute import SimpleImputer
df_num = df.select_dtypes('number')
df_col = df.select_dtypes('object')

imp_mean = SimpleImputer(missing_values = np.nan, strategy = 'mean')
imp_mean_col = SimpleImputer(missing_values = np.nan, strategy = 'most_frequent')

df[df_num.columns] = imp_mean.fit_transform(df_num)
df[df_col.columns] = imp_mean_col.fit_transform(df_col)

Impute missing according to class

In [None]:
#boxplot with age on y-axis and Passenger class on x-axis.
#here we take approximate medians for each class
import seaborn as sns
import matplotlib.pyplot as plt  
plt.figure(figsize=(16, 8))
sns.set_style('darkgrid')
sns.boxplot(x='Pclass', y='Age', data=df, palette='Dark2').set_title('Age_distribition_acording_to_pclass')

In [None]:
#here in each if statement we put each median age according to each class
def impute_age(cols):
    """Return the age for one row, filling a missing age from its passenger class.

    Parameters
    ----------
    cols : sequence of at least two values
        The row's ``(Age, Pclass)`` pair, in that order -- for example the row
        Series produced by ``df[['Age', 'Pclass']].apply(impute_age, axis=1)``.

    Returns
    -------
    The original age when it is present; otherwise 37 for class 1, 29 for
    class 2 and 24 for any other class.
    """
    # Unpack by position. ``cols[0]`` on a label-indexed row Series relies on
    # pandas' deprecated integer-key fallback, so iterate over the values instead.
    age, pclass = list(cols)[:2]

    if not pd.isnull(age):
        return age
    if pclass == 1:
        return 37
    if pclass == 2:
        return 29
    return 24

In [None]:
df['Age'] = df[['Age','Pclass']].apply(impute_age,axis=1)

If missing value is written as something other than null. for ex: " ?"

In [None]:
#it will return column names with count of them
df.isin([" ?"]).sum()

In [None]:
# Replace the ' ?' placeholder with the most frequent real value of the column.
# The mode is computed without the placeholder rows so ' ?' can never be chosen
# as its own replacement, and the result is assigned back rather than using
# replace(inplace=True) on a column selection (which may act on a copy).
placeholder = ' ?'
most_frequent = df.loc[df['columnname'] != placeholder, 'columnname'].mode()[0]
df['columnname'] = df['columnname'].replace(placeholder, most_frequent)

### Eliminating spaces or (, ) like symbols

In [None]:
# Strip surrounding whitespace first, then any leading/trailing parentheses or
# single quotes. The cleaned values are written back to the column; assigning
# the result to `df` itself would replace the whole DataFrame with a list.
df['column_name'] = df['column_name'].str.strip().str.strip("()'")

### Group same variables with different names (China-Mainland China)

In [None]:
# Map the alternative spelling onto the canonical one with a vectorized
# replace; every other value is kept unchanged
df['country'] = df['country'].replace('mainland china', 'china')

### Outliers

In [None]:
#check for numeric columns
# Create the figure first so that figsize applies to the boxplot; creating it
# after plotting opens a second, empty figure
fig = plt.figure(figsize =(10, 7))
plt.boxplot(columnname)
plt.show()

In [None]:
#detecting outliers with Z scores for numeric columns
import numpy as np
import pandas as pd

def detect_outlier(columnname, threshold=3):
    """Return the values of a numeric sequence whose |z-score| exceeds a threshold.

    Parameters
    ----------
    columnname : iterable of numbers
        The values to inspect (list, NumPy array, pandas Series, ...).
    threshold : float, optional
        Minimum absolute z-score for a value to be reported. Defaults to 3.

    Returns
    -------
    list
        The outlying values in their original order. Empty when no value
        exceeds the threshold or when all values are identical.
    """
    mean_1 = np.mean(columnname)
    std_1 = np.std(columnname)

    # With zero spread every z-score would be 0/0, so nothing can be an outlier.
    if std_1 == 0:
        return []

    return [y for y in columnname if np.abs((y - mean_1) / std_1) > threshold]

In [None]:
#deleting outliers in a specific column
# detect_outlier (defined in the previous cell) returns the outlying values;
# keep only the entries of the column that are not among them
sample_outliers = detect_outlier(columnname)
columnname = np.asarray(columnname)
columnname = columnname[~np.isin(columnname, sample_outliers)]

## Feature Engineering 

### Merging two dataframes

In [None]:
store = furniture.merge(office, how='inner', on='Order Date')

### Convert categorical variable into dummy/indicator variables

In [None]:
# get_dummies returns a new DataFrame, so the result must be assigned back for
# the encoding to take effect; drop_first drops one redundant indicator column
df = pd.get_dummies(df, prefix_sep='_', columns = ['Gender'], drop_first=True)

### Binarize Data (for ratio using threshold ex: age > 60 then 1)

In [None]:
age = df.iloc[:, 1]
from sklearn.preprocessing import Binarizer
# Binarizer expects a 2-D array; a single row holding every age is enough
# because the thresholding is applied element-wise
x = age.values.reshape(1, -1)

# For age, let threshold be 35 
# The threshold must be passed by keyword: scikit-learn estimators accept
# keyword-only constructor arguments since version 1.0
binarizer = Binarizer(threshold=35)

# Transformed feature: 1 where age > 35, otherwise 0
Binarized_age = binarizer.fit_transform(x)
df ['Bin.Age>35'] = Binarized_age[0]

### One - hot Encoding (for Categorical Nominal data ex: weekdays)

In [None]:
# One-hot encoding creates a separate 0/1 indicator column for each distinct
# category found in the selected column(s)

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
ct = ColumnTransformer(
    [('one_hot_encoder', OneHotEncoder(categories='auto'), [0])],   # The column numbers to be transformed (here is [0] but can be [0, 1, 3])
    remainder='passthrough'                                         # Leave the rest of the columns untouched
)

x = ct.fit_transform(x)
x

### Label Encoder (for Ordinal data ex: education level)

In [None]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

In [None]:
df.columns

In [None]:
# Columns to encode; each one is overwritten by its integer codes
df_cat = [ 'Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area', 'Loan_Status']
for col in df_cat:
    # The same encoder object is re-fitted on every column, so after the loop
    # it only holds the fit for the last column
    # NOTE(review): LabelEncoder presumably numbers labels in sorted order, which
    # need not match the intended ordinal order -- confirm before relying on it
    df[col] = label_encoder.fit_transform(df[col])

## Scaling

In [None]:
# StandardScaler results in a distribution with a standard deviation equal to 1.
# StandardScaler makes the mean of the distribution approximately 0.
# Use StandardScaler if you want each feature to have zero-mean, unit standard-deviation.
# If you want more normally distributed data, and are okay with transforming your data.

from sklearn.preprocessing import StandardScaler
x = StandardScaler().fit_transform(x)

In [None]:
#MinMaxScaler preserves the shape of the original distribution.
#MinMaxScaler doesn’t reduce the importance of outliers.
#Default range for the feature returned by MinMaxScaler is 0 to 1.

from sklearn.preprocessing import MinMaxScaler
x = MinMaxScaler().fit_transform(x)

In [None]:
# RobustScaler does not scale the data into a predetermined interval like MinMaxScaler.
# Use RobustScaler if you want to reduce the effects of outliers, relative to MinMaxScaler.

from sklearn.preprocessing import RobustScaler
x = RobustScaler().fit_transform(x)

# Machine Learning

- Supervised Learning:
    - Regressions:
        - Linear Regression
        - Support Vector Regressor
        - Decision Trees and Random Forests
        
    - Classifiers:
        - Logistic Regression
        - Naive Bayes
        - Support Vector Machines
        - KNN: K-Nearest Neighbors
        - Decision Trees
        - Random Forests
        - Neural Networks/Deep Learning

- Unsupervised Learning:
    - Clustering:
        - K-Means

## Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import metrics

In [None]:
X = df.iloc[:,:-1] #.values.reshape(-1,1) - for single x linear regression
#X= df.drop("Price", axis = 1)

y = df.iloc[:,-1] #.values.reshape(-1,1) - for single linear regression
#y= df['Price']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state = 123)

In [None]:
model = LinearRegression()
model.fit(X_train, y_train) #training the algorithm

In [None]:
#Printing model coefficient and intercept
#y = kx+b ----> b:
print('Intercept: ', model.intercept_)
#y = kx+b ----> k:
coeff_df = pd.DataFrame(model.coef_, X.columns, columns=['Coefficient'])  
coeff_df

In [None]:
y_pred = model.predict(X_test)
#print('predicted response:', y_pred, sep='\n')

Comparing Actual and Predicted

In [None]:
comparison = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
comparison.head()

Visual Comparison of Actual and Predicted

In [None]:
# First 25 rows of the actual-vs-predicted table built in the previous cell
# (the variable is lowercase `comparison`; `Comparison` raises NameError)
compared = comparison.head(25)
compared.plot(kind='bar',figsize=(16,10))
plt.grid(which='major', linestyle='-', linewidth='0.5', color='green')
plt.grid(which='minor', linestyle=':', linewidth='0.5', color='black')
plt.show()

X_test, y_test Scatter plot on X_test, y_pred regression line

In [None]:
plt.scatter(X_test, y_test,  color='gray')
plt.plot(X_test, y_pred, color='red', linewidth=2)
plt.show()

Evaluation of Linear Regression Model

In [None]:
R2 = metrics.r2_score(y_test, y_pred) #coefficient of determination
print('R^2:' , R2)
n = X_test.shape[0] #sample size
p = X_test.shape[1] #number of predictors
print('Adjusted R^2 :' , 1-(1-R2)*(n-1)/(n-p-1))
print('Mean Absolute Error:' , metrics.mean_absolute_error(y_test, y_pred)) 
print('Mean Squared Error:' , metrics.mean_squared_error(y_test, y_pred)) 
print('Root Mean Squared Error:' , np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

### Assumptions for Linear Models

In [None]:
#"Linearity" with numeric columns
# `height` replaces the former `size` argument, which seaborn renamed in v0.9
p = sns.pairplot(ad_data, x_vars=['input columns'], y_vars='output column', height=7, aspect=0.7)

In [None]:
#"Mean Of Residuals". Should be close to 0.
residuals = y_train.values-y_pred
mean_residuals = np.mean(residuals)
print("Mean of Residuals {}".format(mean_residuals))

In [None]:
#the "Homoscedasticity". 
p = sns.scatterplot(y_pred,residuals)
plt.xlabel('y_pred/predicted values')
plt.ylabel('Residuals')
plt.ylim(-10,10)
plt.xlim(0,26)
p = sns.lineplot([0,26],[0,0],color='blue')
p = plt.title('Residuals vs fitted values plot for homoscedasticity check')

#on the new line - p value should be more than 0.05
import statsmodels.stats.api as sms
from statsmodels.compat import lzip
name = ['F statistic', 'p-value']
test = sms.het_goldfeldquandt(residuals, X_train)
lzip(name, test)

In [None]:
#"Normality of Residuals".
# histplot(kde=True, stat='density') replaces the deprecated distplot:
# a density-normalised histogram with a KDE curve drawn on top
p = sns.histplot(residuals, kde=True, stat='density')
p = plt.title('Normality of error terms/residuals')

In [None]:
#Checking "Autocorrelation".
plt.figure(figsize=(10,5))
# x and y are passed by keyword: recent seaborn only accepts `data` positionally
p = sns.lineplot(x=y_pred, y=residuals, marker='o', color='blue')
plt.xlabel('y_pred/predicted values')
plt.ylabel('Residuals')
plt.ylim(-10,10)
plt.xlim(0,26)
p = sns.lineplot(x=[0,26], y=[0,0], color='red')
p = plt.title('Residuals vs fitted values plot for autocorrelation check')

# Ljung-Box test: the null hypothesis is "no autocorrelation", so the smallest
# p-value should be MORE than 0.05 for the assumption to hold
from statsmodels.stats import diagnostic as diag
# Recent statsmodels returns a DataFrame with 'lb_stat' and 'lb_pvalue' columns
# instead of a tuple, so the p-values are selected by column name
min(diag.acorr_ljungbox(residuals , lags = 40)['lb_pvalue'])

## Logistic Regression - Classification Model

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

In [None]:
X = df.iloc[:,:-1] #.values.reshape(-1,1) - for single x linear regression
#X= df.drop("Price", axis = 1)

y = df.iloc[:,-1] #.values.reshape(-1,1) - for single linear regression
#y= df['Price']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state = 123)

In [None]:
model = LogisticRegression(max_iter = 100000)
model.fit(X_train, y_train) #train the model
y_pred = model.predict(X_test)

Comparing Actual and Prediction

In [None]:
comparison = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
comparison.head()

Evaluation of Logistic Regression Classification Model

In [None]:
print(confusion_matrix(y_test, y_pred))

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
roc_auc_score(y_test, y_pred)

In [None]:
logit_roc_auc = roc_auc_score(y_test, model.predict(X_test))
fpr, tpr, thresholds = roc_curve(y_test, model.predict_proba(X_test)[:,1])
plt.figure(figsize=(16,8))
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Survived or Not')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()

## Naive Bayes - Classification Model

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import accuracy_score

In [None]:
X = df.iloc[:,:-1] #.values.reshape(-1,1) - for single x linear regression
#X= df.drop("Price", axis = 1)

y = df.iloc[:,-1] #.values.reshape(-1,1) - for single linear regression
#y= df['Price']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state = 123)

In [None]:
model = GaussianNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

Comparing actual and prediction

In [None]:
comparison = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
comparison.head()

Evaluation of Naive Bayes Classification Model

In [None]:
cm = confusion_matrix(y_test, y_pred)
print('Confusion Matrix for Naive Bayes:''\n', cm)

In [None]:
cr = classification_report(y_test, y_pred)
print('Classification Report for Naive Bayes:''\n''\n', cr)

In [None]:
auc = roc_auc_score(y_test, y_pred)
print('Auc Score for Naive Bayes:', auc)

In [None]:
NB_roc_auc = roc_auc_score(y_test, model.predict(X_test))
fpr, tpr, thresholds = roc_curve(y_test, model.predict_proba(X_test)[:,1])
plt.figure(figsize=(10,5))
plt.plot(fpr, tpr, label='Naive Bayes (area = %0.2f)' % NB_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Survived or Not')
plt.legend(loc="lower right")
plt.savefig('NB_ROC')
plt.show()

## SVM - Classification Model

In [None]:
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import accuracy_score

In [None]:
X = df.iloc[:,:-1] #.values.reshape(-1,1) - for single x linear regression
#X= df.drop("Price", axis = 1)

y = df.iloc[:,-1] #.values.reshape(-1,1) - for single linear regression
#y= df['Price']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state = 123)

In [None]:
'''
kernel takes low dimensional input space and transforms it to a higher dimensional space. So, converts “not separable” 
problem to “separable problem” by adding more dimension to it.
'''

#"Linear" kernel
model = svm.SVC(kernel = 'linear', probability=True)

#“Polynomial” kernel is better for curved or nonlinear input space.
#model = svm.SVC(kernel = 'poly', degree=2)

#“Radial basis function” kernel can map an input space in infinite dimensional space.
#model = svm.SVC(kernel = 'rbf', C=1, gamma=0.1)

In [None]:
#Train the model using the training sets
model = model.fit(X_train, y_train)
#Predict the response for test dataset
y_pred = model.predict(X_test)

Comparing Prediction and Actual

In [None]:
comparison = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
comparison

Evaluation of SVM Classification Model

In [None]:
cm = confusion_matrix(y_test, y_pred)
print('Confusion Matrix for SVM:''\n', cm)

In [None]:
cr = classification_report(y_test, y_pred)
print("Classification Report for SVM:""\n", cr)

In [None]:
auc = roc_auc_score(y_test, y_pred)
print('Auc Score for svm:', auc)

In [None]:
SVM_roc_auc = roc_auc_score(y_test, model.predict(X_test))
fpr, tpr, thresholds = roc_curve(y_test, model.predict_proba(X_test)[:,1])
plt.figure(figsize=(16,8))
plt.plot(fpr, tpr, label='SVM (area = %02f)' % SVM_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Survived or Not')
plt.legend(loc="lower right")
plt.savefig('SVM_ROC')
plt.show()

## Decision Trees - Classification Model

In [None]:
from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import train_test_split 
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import accuracy_score

In [None]:
X = df.iloc[:,:-1] #.values.reshape(-1,1) - for single x linear regression
#X= df.drop("Price", axis = 1)

y = df.iloc[:,-1] #.values.reshape(-1,1) - for single linear regression
#y= df['Price']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state = 123)

In [None]:
model = DecisionTreeClassifier()
#model = DecisionTreeClassifier(criterion="entropy")
model = model.fit(X_train, y_train)
y_pred = model.predict(X_test)

Evaluation of Decision Tree Classification Model

In [None]:
cm = confusion_matrix(y_test, y_pred)
print('Confusion Matrix for Decision Tree:''\n', cm)

In [None]:
cr = classification_report(y_test, y_pred)
print("Classification Report for Decision Tree:""\n""\n", cr)

In [None]:
auc = roc_auc_score(y_test, y_pred)
print('Auc Score for Decision Tree:', auc)

In [None]:
DT_roc_auc = roc_auc_score(y_test, model.predict(X_test))
fpr, tpr, thresholds = roc_curve(y_test, model.predict_proba(X_test)[:,1])
plt.figure(figsize = (16, 8))
plt.plot(fpr, tpr, label='SVM (area = %02f)' % DT_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Survived or Not')
plt.legend(loc="lower right")
plt.savefig('Decision_Tree_ROC')
plt.show()

## Random Forest - Classification Model

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split 
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import accuracy_score

In [None]:
X = df.iloc[:,:-1] #.values.reshape(-1,1) - for single x linear regression
#X= df.drop("Price", axis = 1)

y = df.iloc[:,-1] #.values.reshape(-1,1) - for single linear regression
#y= df['Price']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state = 123)

In [None]:
model = RandomForestClassifier(n_estimators = 10, random_state = None)
model = model.fit(X_train, y_train)
y_pred = model.predict(X_test)

Evaluation of Random Forest Classification Model

In [None]:
cm = confusion_matrix(y_test, y_pred)
print('Confusion Matrix for SVM:''\n', cm)

In [None]:
cr = classification_report(y_test, y_pred)
print("Classification Report for SVM:""\n""\n", cr)

In [None]:
auc = roc_auc_score(y_test, y_pred)
print('Auc Score for svm:', auc)

In [None]:
RF_roc_auc = roc_auc_score(y_test, model.predict(X_test))
fpr, tpr, thresholds = roc_curve(y_test, model.predict_proba(X_test)[:,1])
plt.figure(figsize = (16, 8))
plt.plot(fpr, tpr, label = 'RF (area = %02f)' % RF_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Survived or Not')
plt.legend(loc="lower right")
plt.savefig('RF_ROC')
plt.show()

## KNN - Classification Model

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split 
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import accuracy_score

In [None]:
X = df.iloc[:,:-1] #.values.reshape(-1,1) - for single x linear regression
#X= df.drop("Price", axis = 1)

y = df.iloc[:,-1] #.values.reshape(-1,1) - for single linear regression
#y= df['Price']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state = 123)

In [None]:
model = KNeighborsClassifier(n_neighbors=5)
model = model.fit(X_train, y_train)
y_pred = model.predict(X_test)

Evaluation of KNN Classification Model

In [None]:
cm = confusion_matrix(y_test, y_pred)
print('Confusion matrix ''\n', cm)

In [None]:
cr = classification_report(y_test, y_pred)
print("Classification Report for KNN:""\n""\n", cr)

In [None]:
auc = roc_auc_score(y_test, y_pred)
print('Auc Score for KNN:', auc)

In [None]:
KNN_roc_auc = roc_auc_score(y_test, model.predict(X_test))
fpr, tpr, thresholds = roc_curve(y_test, model.predict_proba(X_test)[:,1])
plt.figure(figsize = (16, 8))
plt.plot(fpr, tpr, label = 'KNN (area = %02f)' % KNN_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Survived or Not')
plt.legend(loc="lower right")
plt.savefig('KNN_ROC')
plt.show()

## Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = model, X = X_train, y = y_train, cv = 10)

print('Mean accuracy CV:',accuracies.mean())
print('Standard deviation of CV:',accuracies.std())

## Ridge and Lasso

In [None]:
from sklearn.linear_model import Ridge
ridge_l = [0,1,10,100]

In [None]:
# Fit one Ridge model per regularisation strength and keep every result;
# without the dict each iteration overwrites the coefficients of the previous one
ridge_results = {}
for l in ridge_l:
    ridge = Ridge(alpha=l)
    ridge.fit(x, y)

    ridge_coeff = ridge.coef_
    ridge_intercept = ridge.intercept_
    # alpha -> (coefficients, intercept)
    ridge_results[l] = (ridge_coeff, ridge_intercept)

In [None]:
from sklearn.linear_model import Lasso
lasso_l = [0,0.5,1,2,4]

In [None]:
# Fit one Lasso model per regularisation strength and keep every result;
# without the dict each iteration overwrites the coefficients of the previous one
lasso_results = {}
for l in lasso_l:
    lasso = Lasso(alpha=l)
    lasso.fit(x, y)

    lasso_coeff = lasso.coef_
    lasso_intercept = lasso.intercept_
    # alpha -> (coefficients, intercept)
    lasso_results[l] = (lasso_coeff, lasso_intercept)

## Gradient Boosting Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
gbm = GradientBoostingClassifier(random_state = 123)

# The train/test split in this notebook is named X_train / X_test (capital X);
# the lowercase names are never defined
gbm.fit(X_train, y_train)

gbm_pred = gbm.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, gbm_pred)

## XGBoost Classifier

In [None]:
#if you need for regression problem you just change XGBClassifier to XGBRegressor
pip install xgboost

import xgboost
print(xgboost.__version__)

from xgboost import XGBClassifier

In [None]:
# Shallow boosted-tree classifier for a two-class target
xgbt = XGBClassifier(max_depth = 2,
             learning_rate = 0.2,
             objective  = "multi:softmax",
             num_class = 2,
             booster = "gbtree",
             n_estimators = 10,  # was misspelled "n_estimarors", so the intended tree count was not applied
             random_state = 123)

In [None]:
xgbt.fit(X_train, y_train)

xgbt_pred = xgbt.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, xgbt_pred)

In [None]:
xgbt.score(X_train, y_train)

In [None]:
xgbt.score(X_test, y_test)

# CatBoost

In [None]:
from catboost import CatBoostRegressor

model = CatBoostRegressor(iterations=2,
                          learning_rate=1,
                          depth=2)

model.fit(X_train, y_train)

preds = model.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score

model.score(X_train, y_train)

model.score(X_test, y_test)

# Hyperparameter Tuning

## Manual Tuning (Hand-Coded)

In [None]:
from sklearn.svm import SVC
svc = SVC(random_state=1234, kernel="rbf",C=1)
svc.fit(X_train, y_train)
y_pred_1 = svc.predict(X_test)

from sklearn import metrics

print("Accuracy:",metrics.accuracy_score(y_test, y_pred_1))

In [None]:
svc = SVC(random_state=1234, kernel="poly", C = 1 )
svc.fit(X_train, y_train)
y_pred_2 = svc.predict(X_test)

# Score this model's own predictions (y_pred_2), not those of the earlier RBF model
print("Accuracy:",metrics.accuracy_score(y_test, y_pred_2))

## GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

Random Forest

In [None]:
rfc_param = {'n_estimators':[10,15,20], 
            'min_samples_split':[8,16],
            'min_samples_leaf':[1,2,3,4,5]
            }

In [None]:
# The estimator to tune has to exist before it is handed to GridSearchCV;
# at this point in the notebook `rfc` has not been created yet
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(random_state=1234)

rfc_grid = GridSearchCV(estimator=rfc, 
                        param_grid=rfc_param,
                        scoring='accuracy',
                        cv=10,
                        n_jobs=-1,
                        return_train_score=True)

In [None]:
rfc_grid_fit = rfc_grid.fit(x,y)

cv_results_rfc = pd.DataFrame.from_dict(rfc_grid_fit.cv_results_)

Logistic Regression

In [None]:
lrc_param = {'C':[0.01, 0.1, 0.5, 1, 2, 5, 10], 
            'penalty':['l2'],
            'solver':['liblinear','lbfgs', 'saga']
            }

In [None]:
# `lrc` is never defined elsewhere in the notebook, so create the estimator here;
# a generous max_iter lets the lbfgs/saga solvers in the grid converge
from sklearn.linear_model import LogisticRegression
lrc = LogisticRegression(random_state=1234, max_iter=100000)

lrc_grid = GridSearchCV(estimator=lrc, 
                        param_grid=lrc_param,
                        scoring='accuracy',
                        cv=10,
                        n_jobs=-1,
                        return_train_score=True,
                        verbose=True)

In [None]:
lrc_grid_fit = lrc_grid.fit(x, y)

cv_results_lrc = pd.DataFrame.from_dict(lrc_grid_fit.cv_results_)

Support Vector Classifier

In [None]:
svc_param = {'C':[0.01, 0.1, 0.5, 1, 2, 5, 10], 
            'kernel':['rbf', 'linear'],
            'gamma':[0.1, 0.25, 0.5, 1, 5]
            }

In [None]:
svc_grid = GridSearchCV(estimator=svc, 
                        param_grid=svc_param,
                        scoring='accuracy',
                        cv=10,
                        n_jobs=-1,
                        return_train_score=True)

In [None]:
svc_grid_fit = svc_grid.fit(x, y)

cv_results_svc = pd.DataFrame.from_dict(svc_grid_fit.cv_results_)

In [None]:
# Get the top ranked test score for all the three classifiers

rfc_top_rank = cv_results_rfc[cv_results_rfc['rank_test_score'] == 1]
lrc_top_rank = cv_results_lrc[cv_results_lrc['rank_test_score'] == 1]
svc_top_rank = cv_results_svc[cv_results_svc['rank_test_score'] == 1]

## RandomizedSearchCV 

In [None]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(random_state=1234)

from sklearn.model_selection import RandomizedSearchCV

In [None]:
rfc_param = {'n_estimators':[10,15,20], 
            'min_samples_split':[8,16],
            'min_samples_leaf':[1,2,3,4,5]
            }

In [None]:
rfc_rs = RandomizedSearchCV(estimator=rfc, 
                        param_distributions=rfc_param,
                        scoring='accuracy',
                        cv=10,
                        n_iter=10,
                        return_train_score=True,
                        random_state=1234)

In [None]:
rfc_rs_fit = rfc_rs.fit(x, y)

cv_results_rfc_rs = pd.DataFrame.from_dict(rfc_rs_fit.cv_results_)

In [None]:
print('\n The best Parameters are : ')
print(rfc_rs_fit.best_params_)

## BayesSearchCV

In [None]:
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

# Bayesian optimisation over the SVC hyper-parameter space (32 sampled settings)
opt = BayesSearchCV(
    SVC(),
    {
        'C': Real(1e-6, 1e+6, prior='log-uniform'),
        'gamma': Real(1e-6, 1e+1, prior='log-uniform'),
        'degree': Integer(1,8),
        'kernel': Categorical(['linear', 'poly', 'rbf']),
    },
    n_iter=32,
    random_state=0
)

# Fit on the training split; it is named X_train (capital X), matching the
# X_test used for scoring below
_ = opt.fit(X_train, y_train)


print(opt.score(X_test, y_test))

In [None]:
from sklearn.metrics import classification_report

# Unsupervised Learning 

## PCA 

In [None]:
dataset = pd.read_csv('Wine.csv')
X = dataset.iloc[:, 0:13].values
y = dataset.iloc[:, 13].values

In [None]:
y.shape

In [None]:
X.shape

In [None]:
sc = StandardScaler()
X = sc.fit_transform(X)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [None]:
# PCA is not imported anywhere earlier in the notebook
from sklearn.decomposition import PCA

# Fit the projection on the training split only, then apply it to the test split
pca = PCA(n_components = 2)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)
# Share of the total variance captured by each of the two components
explained_variance = pca.explained_variance_ratio_

In [None]:
explained_variance

In [None]:
X_train

In [None]:
X_test

In [None]:
# Fitting Logistic Regression to the Training set

classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)

In [None]:
# Predicting the Test set results
y_pred = classifier.predict(X_test)

In [None]:
# Making the Confusion Matrix

cm = confusion_matrix(y_test, y_pred)
cm

In [None]:
y_pred

In [None]:
accuracy_score(y_test, y_pred)

In [None]:
# Visualising the Training set results
# ListedColormap is needed here; the notebook only imports it in the next cell
from matplotlib.colors import ListedColormap

X_set, y_set = X_train, y_train
# The ranges of the x1 and x2 (PC1 and PC2) axes are derived from the minimum
# and maximum values of each component, padded by 1 on both sides
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
                     np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
# Colour every grid point by the class the classifier predicts for it
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             alpha = 0.75, cmap = ListedColormap(('red', 'green', 'blue')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
# Overlay the actual samples, one scatter call (and legend entry) per class
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                c = ListedColormap(('red', 'green', 'blue'))(i), label = j)
plt.title('Logistic Regression (Training set)')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.legend()
plt.show()


In [None]:
# Visualising the Test set results
from matplotlib.colors import ListedColormap
X_set, y_set = X_test, y_test
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
                     np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             alpha = 0.75, cmap = ListedColormap(('red', 'green', 'blue')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                c = ListedColormap(('red', 'green', 'blue'))(i), label = j)
plt.title('Logistic Regression (Test set)')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.legend()
plt.show()

# Time Series Analysis

## FBprophet 

In [None]:
df = pd.read_csv("/content/wiki_machine_learning.csv", sep = " ")
df.head()

In [None]:
#minimum and maximum dates
df['date'].min(), df['date'].max()

In [None]:
#removing columns we don't need and check if there is any missing value
cols = ['lang', 'page', 'rank', 'month', 'title']
df.drop(cols, axis=1, inplace=True)
df = df.sort_values('date')
df.isnull().sum()

In [None]:
#to fill null values
# Back-fill: each missing entry takes the next valid value below it.
# The original call used typographic quotes (a SyntaxError) and the deprecated
# fillna(method=...) form; bfill() is the direct equivalent.
df = df.bfill()

In [None]:
#groupby index date
df = df.groupby('date')['count'].sum().reset_index()

In [None]:
df = df.set_index('date')
df.index

In [None]:
y = df['count']
y.plot(figsize=(15, 6))
plt.show()

In [None]:
df = df.reset_index()

In [None]:
df.shape

In [None]:
#creating prophet model
# The package was renamed from `fbprophet` to `prophet` in its 1.0 release;
# fall back to the old name for environments that still have the legacy package
try:
    from prophet import Prophet
except ImportError:
    from fbprophet import Prophet

# Prophet expects the time column to be called 'ds' and the value column 'y'
df = df.rename(columns={'date': 'ds', 'count': 'y'})
df_model = Prophet(interval_width=0.95)
df_model.fit(df)
# Extend the history by 30 daily periods and predict over the whole range
df_forecast = df_model.make_future_dataframe(periods=30, freq='D')
df_forecast = df_model.predict(df_forecast)

#for month - 'month' or 'M', hour - 'H', day - 'D',

In [None]:
# Plot the forecast.
# Prophet's plot() creates its own figure, so the size must be passed through its
# `figsize` argument; a preceding plt.figure(...) would only leave behind an
# unused (and, at 300x500 inches, enormous) empty figure.
df_model.plot(df_forecast, xlabel='date', ylabel='count', figsize=(15, 6))
plt.title('Count for Next 30');

In [None]:
# Plot the individual components of the forecast (trailing ';' hides the cell's text output)
df_model.plot_components(df_forecast);

## Neural Prophet 

In [None]:
from neuralprophet import NeuralProphet

In [None]:
# Load the Superstore workbook and keep only the rows in the Furniture category
df = pd.read_excel("Superstore.xls")
is_furniture = df['Category'] == 'Furniture'
furniture = df.loc[is_furniture]

In [None]:
# Remove the columns that are not needed for the sales time series (the cells
# below only use 'Order Date' and 'Sales'), order the rows chronologically,
# then count the missing values left in each column.
cols = [
    'Row ID', 'Order ID', 'Ship Date', 'Ship Mode', 'Customer ID',
    'Customer Name', 'Segment', 'Country', 'City', 'State', 'Postal Code',
    'Region', 'Product ID', 'Category', 'Sub-Category', 'Product Name',
    'Quantity', 'Discount', 'Profit',
]
# Rebind rather than drop in place: `furniture` was obtained by filtering `df`,
# and mutating such a slice in place triggers pandas' SettingWithCopyWarning.
furniture = furniture.drop(columns=cols).sort_values('Order Date')
furniture.isnull().sum()

In [None]:
# Collapse to one row per order date holding the summed sales for that date
daily_sales = furniture.groupby('Order Date')['Sales'].sum()
furniture = daily_sales.reset_index()

In [None]:
# Use the order date as the index; the resample step below groups by this index
furniture = furniture.set_index('Order Date')

In [None]:
# Build a monthly series: the mean of the daily sales totals within each month,
# timestamped at the start of the month ('MS').
y = furniture['Sales'].resample('MS').mean()

In [None]:
# NeuralProphet expects the time column to be called 'ds' and the value to predict 'y'
y = y.reset_index().rename(columns={"Order Date": "ds", "Sales": "y"})

In [None]:
# Preview the first rows of the renamed frame
y.head()

In [None]:
# Plot the value column 'y' against the time column 'ds'
y.plot(x='ds',y='y',figsize=(15, 6))
plt.show()

In [None]:
# A plain `model = NeuralProphet()` is enough when none of the settings below
# need to be overridden.
# NOTE(review): the accepted keyword names depend on the installed NeuralProphet
# version - confirm against the version in use.
model = NeuralProphet(
    # Trend
    growth="linear",  # Trend type: 'linear', 'discontinuous', 'off'
    changepoints=None, # List of dates that may include change points (None -> automatic)
    n_changepoints=5,
    changepoints_range=0.8,
    trend_reg=0,
    trend_reg_threshold=False,
    # Seasonality
    yearly_seasonality="auto",
    weekly_seasonality="auto",
    daily_seasonality="auto",
    seasonality_mode="additive",
    seasonality_reg=0,
    # Forecast horizon, autoregression and network
    n_forecasts=1,
    n_lags=0,
    num_hidden_layers=0,
    d_hidden=None,     # Dimension of hidden layers of AR-Net
    ar_sparsity=None,  # Sparsity in the AR coefficients
    # Training
    learning_rate=None,
    epochs=40,
    loss_func="Huber",
    # Data handling
    normalize="auto",  # Type of normalization ('minmax', 'standardize', 'soft', 'off')
    impute_missing=True
)

In [None]:
# Fit on the monthly frame (month-start frequency) and keep what fit() returns in `metrics`
metrics = model.fit(y, validate_each_epoch=True, freq="MS")
# Build a frame that extends 10 periods past the data; n_historic_predictions=len(y)
# presumably keeps every historic row in it as well - confirm against the NeuralProphet docs
future = model.make_future_dataframe(y, periods=10, n_historic_predictions=len(y))
forecast = model.predict(future)

In [None]:
# Plot the predictions on an explicitly sized axes, which is handed to the model's plot()
fig, ax = plt.subplots(figsize=(14, 10))
model.plot(forecast, xlabel="Date", ylabel="Sales", ax=ax);

In [None]:
# Plot the fitted parameters of the model
model.plot_parameters()

## Pipeline / Pickle

In [None]:
import io
from google.colab import files
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load the raw data and work on a copy so `data` stays untouched
data=pd.read_csv('/content/income_evaluation.csv')
df=data.copy()

In [None]:
# Display the working frame
df

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [None]:
# Remove every space character from the column names
df.columns=df.columns.str.replace(' ','')

In [None]:
# Drop the 'education' column, then list the columns that remain
df = df.drop(columns='education')
df.columns

In [None]:
# Separate the features from the target and hold out 20% of the rows for testing.
x = df.drop("income", axis=1)

# Encode the income classes as integers 0..n_classes-1: current XGBoost releases
# reject string class labels when fitting. `income_encoder.inverse_transform`
# maps predictions back to the original class names when needed.
income_encoder = LabelEncoder()
y = pd.Series(income_encoder.fit_transform(df['income']), index=df.index, name='income')

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [None]:
# Split the feature names by dtype: numeric columns vs. object (string) columns
numeric_columns=x.select_dtypes(include="number").columns
categorical_columns=x.select_dtypes(include="object").columns

In [None]:
# Display the names of the numeric feature columns
numeric_columns

In [None]:
# Numeric features: a single standard-scaling step
numerical_Pipeline = Pipeline(steps=[( 'scaler', StandardScaler() ) ])

In [None]:
# Categorical features: one-hot encoding; handle_unknown='ignore' keeps transform
# from failing on categories that were not seen during fit.
categorical_Pipeline = Pipeline(steps=[('encoder1', OneHotEncoder(handle_unknown='ignore'))])
# LabelEncoder is deliberately not a step here: scikit-learn documents it for
# encoding target labels (y), not input features.

In [None]:
# Route each column group through its own pipeline; every entry is
# (name, transformer, columns it applies to)
preprocessor = ColumnTransformer(
    transformers=[
        ('categorical', categorical_Pipeline, categorical_columns),
        ('numerical', numerical_Pipeline, numeric_columns)
    ])


In [None]:
# Top-level pipeline; currently it holds only the preprocessing step
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

In [None]:
# Fit the preprocessing on the training split only
pipe.fit(x_train)

In [None]:
# Apply the fitted preprocessing to both splits (overwrites the raw frames)
x_train = pipe.transform(x_train)
x_test = pipe.transform(x_test)

In [None]:
import xgboost as xgb

# Train the classifier on the preprocessed training split, then predict the test split
model = xgb.XGBClassifier(learning_rate=0.01, random_state=1).fit(x_train, y_train)
y_pred = model.predict(x_test)
y_pred

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Confusion matrix of the test predictions, followed by the overall accuracy
test_confusion = confusion_matrix(y_test, y_pred)
print("Confusion Matrix \n", test_confusion)
accuracy_score(y_test, y_pred)

In [None]:
import pickle

document = "myModel"

# Persist the trained model. The context manager guarantees the file is flushed
# and closed before it is read back (a bare open() leaves the handle open).
with open(document, "wb") as model_file:
    pickle.dump(model, model_file)

# NOTE: unpickling can execute arbitrary code - only load pickle files that you
# created yourself or otherwise trust.
with open(document, "rb") as model_file:
    loaded_model = pickle.load(model_file)

# Predict with the restored model to check that it behaves like the original.
y_loded_model_pred = loaded_model.predict(x_test)

In [None]:
from sklearn.metrics import accuracy_score,confusion_matrix

# Evaluate the predictions made by the model restored from the pickle file.
print("Confusion Matrix \n\n",confusion_matrix(y_test,y_loded_model_pred),"\n")
# accuracy_score returns a fraction in [0, 1]; scale by 100 before labelling it as a percentage.
print("Accuracy Score = %0.2f %%" % (100 * accuracy_score(y_test,y_loded_model_pred)))

# Advanced Visualization

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

Scatter Plot

In [None]:
# Marker-only plot of MaxTemp against MinTemp.
# DataFrame.plot() opens its own figure, so the size is passed through its
# `figsize` argument; a preceding plt.figure(...) would only leave an empty figure behind.
df.plot(x='MinTemp', y='MaxTemp', style='+', figsize=(16, 8))
plt.title('Scatter Plot of ')
plt.xlabel('X Label')
plt.ylabel('Y Label')
plt.show()

Distribution Plot

In [None]:
# Histogram (30 bins) with a KDE curve for one numeric column; missing values are dropped first
plt.figure(figsize=(16, 8))
numeric_values = df['numeric_column'].dropna()
hist_axes = sns.histplot(numeric_values, bins=30, kde=True)
hist_axes.set_title('Distribition of numeric_column')

Distribution of Target variable according to class (for classification)

In [None]:
# Count of each target value, with one bar per 'class' value
plt.figure(figsize=(16, 8))
sns.set_style('darkgrid')
count_axes = sns.countplot(data=df, x='target', hue='class', palette='Dark2')
count_axes.set_title('Survived_by_Pclass')

Boxplot

In [None]:
# Box plot with passenger class on the x-axis and age on the y-axis:
# one box is drawn for every distinct class value
plt.figure(figsize=(16, 8))
sns.set_style('darkgrid')
box_axes = sns.boxplot(data=df, x='Pclass', y='Age', palette='Dark2')
box_axes.set_title('Age_distribition_acording_to_pclass')