<a href="https://colab.research.google.com/github/VitorGit93/Pesquisa_Evasao/blob/main/Recursos/Notebooks%20de%20exemplo/ml_algorithms_usage_and_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Machine Learning is a vast topic and, as the name says, it can be defined as teaching our machines certain things and processes so that they can behave correctly according to the knowledge they gain from learning.

#### There are many algorithms for Machine Learning and we will see each one's usage in this notebook.

#### We use a dataset about students and apply these algorithms to find out which one gives the best performance on this particular dataset.

_______________________________________________________

### Importing the libraries

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
import plotly.express as px
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)

import seaborn as sns

from sklearn.decomposition import PCA

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score

import pickle

import warnings
warnings.filterwarnings('ignore')

### Reading the dataset from CSV file

In [None]:
# Location of the dataset CSV inside the Kaggle input directory.
dataset_csv = '/kaggle/input/higher-education-predictors-of-student-retention/dataset.csv'
student = pd.read_csv(dataset_csv)

#### Once this data is in the DataFrame, we will perform following steps:
1. Understand the data
2. Pre-processing, Exploratory Data Analysis (EDA), Feature Selection
3. Extract Input and Output columns
4. Scaling the values
5. Splitting the data into Training & Testing Data
6. Training the model
7. Evaluate the model & Select the Model
8. Model Tuning
9. Model Deployment

## **Understanding the data**

In [None]:
# Dimensions of the loaded data as a (rows, columns) tuple
student.shape

In [None]:
# List the column labels (the dataset is expected to have 35 columns)
student.columns

In [None]:
# Peek at 4 randomly chosen rows; no random_state is passed, so the rows differ on each run
student.sample(4)


In [None]:
# First 5 rows, in file order
student.head(5)

In [None]:
# Per-column dtype and non-null count, plus overall memory usage
student.info()

## **Pre-processing**

### Looks like there are no nulls or duplicates but still we can check and handle if required.

In [None]:
# Number of missing values in each column (all zeros means nothing to impute)
print(student.isnull().sum())

In [None]:
# Number of rows that exactly duplicate an earlier row
print(student.duplicated().sum())

#### Only the **Target** column is non-numeric, so we convert it to numeric
#### Note that **Target** is the output column, so we need it in numeric form to find its correlation with the other columns

In [None]:
# Distinct labels present in the output column before encoding
student['Target'].unique()

### So there are 3 unique values in target column which we can replace by
* Dropout -> 0
* Enrolled -> 1
* Graduate -> 2

In [None]:
# Integer codes for the three outcome labels.
target_codes = {
    'Dropout': 0,
    'Enrolled': 1,
    'Graduate': 2,
}

# Encode only while the column still holds text labels. Re-running this cell
# after the column is already numeric would otherwise map every value to NaN,
# because the integer codes are not keys of the mapping.
if not pd.api.types.is_numeric_dtype(student['Target']):
    student['Target'] = student['Target'].map(target_codes)

In [None]:
# Display the frame to confirm the Target column now holds the codes 0, 1 and 2
student

In [None]:
# Column dtypes after encoding; Target should no longer be a text column
student.dtypes
# Target column is integer now

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
student.describe()

### Finally find the correlation of Target with all other numeric columns

In [None]:
# Correlation of every column with Target (pandas' default method is Pearson);
# Target's own entry is 1.0
student.corr()['Target']

In [None]:
# Heatmap of the pairwise correlation matrix. Passing the raw DataFrame to
# px.imshow would render every student row as a strip of pixels, which says
# nothing about how the columns relate to Target.
fig = px.imshow(student.corr())
fig.show()

### Looking at the correlation, we need to select the required columns for prediction.

In [None]:
# Positional indices of the columns kept for modelling. The final index (34)
# becomes the last column of the new frame, which is later used as the output.
selected_positions = [1, 11, 13, 14, 15, 16, 17, 20, 22, 23, 26, 28, 29, 34]

# New DataFrame restricted to the relevant input and output columns.
student_df = student.iloc[:, selected_positions]

In [None]:
# First rows of the reduced frame, to confirm which columns were kept
student_df.head()

In [None]:
# Dtypes and non-null counts of the selected columns
student_df.info()

In [None]:

# Heatmap of pairwise correlations among the selected columns. Feeding the raw
# frame to sns.heatmap colours individual cell values instead, which is
# dominated by the widest-ranging column and shows no relationships.
sns.heatmap(student_df.corr())

## **EDA -  We will perform Exploratory Data Analysis on student_df**

In [None]:
# Number of rows per encoded class (0 = Dropout, 1 = Enrolled, 2 = Graduate),
# sorted from most to least frequent
student_df['Target'].value_counts()

In [None]:
# Count the students in each outcome class.
target_counts = student_df['Target'].value_counts()

# Translate the integer codes back to readable names so each slice is labelled
# from its own row. Hard-coding the label list on the trace relied on
# value_counts() happening to return Graduate, Dropout, Enrolled in that order.
target_names = {0: 'Dropout', 1: 'Enrolled', 2: 'Graduate'}

df = pd.DataFrame({
    'Target': target_counts.index.map(target_names),
    'Count_T': target_counts.values
})

fig = px.pie(df,
             names='Target',
             values='Count_T',
             title='How many dropouts, enrolled & graduates are there in Target column')

# Donut styling: show count and label on each slice, and pull the second and
# third slices slightly out of the ring.
fig.update_traces(hole=0.4, textinfo='value+label', pull=[0, 0.2, 0.1])
fig.show()

In [None]:
# Correlation of Target with each of the selected columns
student_df.corr()['Target']

### Let's plot the column ***Curricular units 2nd sem (approved)*** against ***Curricular units 1st sem (approved)*** and differentiate ***Target*** by color

In [None]:
# Approved curricular units: 1st semester on x, 2nd semester on y, coloured by Target.
approved_axes = dict(
    x='Curricular units 1st sem (approved)',
    y='Curricular units 2nd sem (approved)',
    color='Target',
)
fig = px.scatter(student_df, **approved_axes)
fig.show()

### Let's plot the column ***Curricular units 2nd sem (grade)*** against ***Curricular units 1st sem (grade)*** and differentiate ***Target*** by color

In [None]:
# Semester grades: 1st semester on x, 2nd semester on y, coloured by Target.
grade_axes = dict(
    x='Curricular units 1st sem (grade)',
    y='Curricular units 2nd sem (grade)',
    color='Target',
)
fig = px.scatter(student_df, **grade_axes)
fig.show()

In [None]:
# Enrolled curricular units: 1st semester on x, 2nd semester on y, coloured by Target.
enrolled_axes = dict(
    x='Curricular units 1st sem (enrolled)',
    y='Curricular units 2nd sem (enrolled)',
    color='Target',
)
fig = px.scatter(student_df, **enrolled_axes)
fig.show()

In [None]:
# Box plot of enrollment age, to show its spread and any outliers
fig = px.box(student_df, y='Age at enrollment')
fig.show()

In [None]:
# Distribution of students' age at enrollment, with a kernel density estimate overlaid
sns.histplot(data=student_df['Age at enrollment'], kde=True)

In [None]:
# Interactive plotly version of the same age histogram, drawn in red
px.histogram(student_df['Age at enrollment'], x='Age at enrollment',color_discrete_sequence=['red'])

## Extract Input & Output Columns

In [None]:
# Inputs: the first 13 columns of student_df
X = student_df.iloc[:,0:13]
# Output: the last column of student_df
y = student_df.iloc[:,-1]
# Display the input frame as the cell output
X

## **Splitting the data into Training & Testing Data**

In [None]:
# Hold out 20% of the rows for testing. A fixed random_state makes every score
# below reproducible across kernel restarts, and stratify=y keeps the class
# proportions approximately the same in the training and test splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y
)

In [None]:
# Print the shape of each split in the order: X_train, X_test, y_train, y_test.
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

## **Training the model**

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings, trained on the unscaled features.
clf = LogisticRegression()
y_pred = clf.fit(X_train, y_train).predict(X_test)

# Accuracy on the held-out test split.
print("Without Scaling and without CV: ", accuracy_score(y_test, y_pred))

# Mean score over 10 cross-validation folds of the training split.
scores = cross_val_score(clf, X_train, y_train, cv=10)
print("Without Scaling and With CV: ", np.mean(scores))

### SGD - Stochastic Gradient Descent

In [None]:
from sklearn.linear_model import SGDClassifier

# SGD shuffles the training data between epochs, so it is seeded here to make
# both scores repeatable (matching the seeded Perceptron used in this notebook).
clf = SGDClassifier(max_iter=1000, tol=1e-3, random_state=0)

# Without Scaling
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Accuracy on the held-out test split.
print("Without Scaling and without CV: ", accuracy_score(y_test, y_pred))

# Mean score over 10 cross-validation folds of the training split.
scores = cross_val_score(clf, X_train, y_train, cv=10)
print("Without Scaling and With CV: ", scores.mean())

### Perceptron

In [None]:
from sklearn.linear_model import Perceptron

# Perceptron is the same as
# SGDClassifier(loss="perceptron", eta0=1, learning_rate="constant", penalty=None)
clf = Perceptron(tol=1e-3, random_state=0)

# Trained on the unscaled features.
y_pred = clf.fit(X_train, y_train).predict(X_test)
print("Without Scaling and without CV: ", accuracy_score(y_test, y_pred))

# Mean score over 10 cross-validation folds of the training split.
scores = cross_val_score(clf, X_train, y_train, cv=10)
print("Without Scaling and With CV: ", np.mean(scores))

### Logistic Regression CV

In [None]:
from sklearn.linear_model import LogisticRegressionCV

# Logistic regression with its own internal 5-fold cross-validation, seeded.
clf = LogisticRegressionCV(cv=5, random_state=0)

# Trained on the unscaled features.
y_pred = clf.fit(X_train, y_train).predict(X_test)
print("Without Scaling and without CV: ", accuracy_score(y_test, y_pred))

# Outer 10-fold cross-validation of the whole estimator on the training split.
scores = cross_val_score(clf, X_train, y_train, cv=10)
print("Without Scaling and With CV: ", np.mean(scores))

### Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Seeded decision tree with otherwise default settings, on unscaled features.
clf = DecisionTreeClassifier(random_state=0)
y_pred = clf.fit(X_train, y_train).predict(X_test)

# Accuracy on the held-out test split.
print("Without Scaling and without CV: ", accuracy_score(y_test, y_pred))

# Mean score over 10 cross-validation folds of the training split.
scores = cross_val_score(clf, X_train, y_train, cv=10)
print("Without Scaling and With CV: ", np.mean(scores))

### Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seeded random forest with tree depth capped at 10, on unscaled features.
clf = RandomForestClassifier(max_depth=10, random_state=0)
y_pred = clf.fit(X_train, y_train).predict(X_test)

# Accuracy on the held-out test split.
print("Without Scaling and without CV: ", accuracy_score(y_test, y_pred))

# Mean score over 10 cross-validation folds of the training split.
scores = cross_val_score(clf, X_train, y_train, cv=10)
print("Without Scaling and With CV: ", np.mean(scores))

### Support Vector Machines

In [None]:
from sklearn.svm import SVC

# Grid-search an SVC over kernel type and regularisation strength C.
svc = SVC()
parameters = {'kernel': ('linear', 'rbf'), 'C': [1, 10]}
clf = GridSearchCV(svc, parameters)

# Fit the search on the unscaled training data, then predict the test split.
y_pred = clf.fit(X_train, y_train).predict(X_test)
print("Without Scaling and without CV: ", accuracy_score(y_test, y_pred))

# cross_val_score receives the GridSearchCV object itself, so the full grid
# search is repeated inside each of the 10 outer folds.
scores = cross_val_score(clf, X_train, y_train, cv=10)
print("Without Scaling and With CV: ", np.mean(scores))


### NuSVC

In [None]:
from sklearn.svm import NuSVC

# Nu-support vector classifier with default settings, on unscaled features.
clf = NuSVC()
y_pred = clf.fit(X_train, y_train).predict(X_test)

# Accuracy on the held-out test split.
print("Without Scaling and without CV: ", accuracy_score(y_test, y_pred))

# Mean score over 10 cross-validation folds of the training split.
scores = cross_val_score(clf, X_train, y_train, cv=10)
print("Without Scaling and With CV: ", np.mean(scores))

### Linear SVC

In [None]:
from sklearn.svm import LinearSVC

# Seeded linear SVC with a tight stopping tolerance, on unscaled features.
clf = LinearSVC(random_state=0, tol=1e-5)

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# This is the plain hold-out score, so it is labelled "without CV" like every
# other model cell; the previous label read as if CV had been applied.
print("Without Scaling and without CV: ", accuracy_score(y_test, y_pred))

# Mean score over 10 cross-validation folds of the training split.
scores = cross_val_score(clf, X_train, y_train, cv=10)
print("Without Scaling and With CV: ", scores.mean())

## Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings, on unscaled features.
clf = GaussianNB()

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# This is the plain hold-out score, so it is labelled "without CV" like every
# other model cell; the previous label read as if CV had been applied.
print("Without Scaling and without CV: ", accuracy_score(y_test, y_pred))

# Mean score over 10 cross-validation folds of the training split.
scores = cross_val_score(clf, X_train, y_train, cv=10)
print("Without Scaling and With CV: ", scores.mean())

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes with default settings, on unscaled features.
clf = MultinomialNB()
y_pred = clf.fit(X_train, y_train).predict(X_test)

# Accuracy on the held-out test split.
print("Without Scaling and without CV: ", accuracy_score(y_test, y_pred))

# Mean score over 10 cross-validation folds of the training split.
scores = cross_val_score(clf, X_train, y_train, cv=10)
print("Without Scaling and With CV: ", np.mean(scores))

In [None]:
from sklearn.naive_bayes import BernoulliNB

# Bernoulli naive Bayes with default settings, on unscaled features.
clf = BernoulliNB()
y_pred = clf.fit(X_train, y_train).predict(X_test)

# Accuracy on the held-out test split.
print("Without Scaling and without CV: ", accuracy_score(y_test, y_pred))

# Mean score over 10 cross-validation folds of the training split.
scores = cross_val_score(clf, X_train, y_train, cv=10)
print("Without Scaling and With CV: ", np.mean(scores))

In [None]:
from sklearn.naive_bayes import CategoricalNB

# CategoricalNB sizes each feature's category table from the largest value seen
# during fit. If predict() later meets a larger value it raises IndexError, so
# reserve room for every category index present in the full feature matrix.
# Values are truncated to int because the estimator treats its inputs as
# integer category codes.
min_categories = (X.max().astype(int) + 1).to_numpy()
clf = CategoricalNB(min_categories=min_categories)

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Accuracy on the held-out test split.
print("Without Scaling and without CV: ", accuracy_score(y_test, y_pred))

# With the category tables pre-sized, cross-validation no longer fails on
# folds whose validation part holds a value unseen in the training part.
scores = cross_val_score(clf, X_train, y_train, cv=10)
print("Without Scaling and With CV: ", scores.mean())

### K Nearest Neighbours

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest neighbours with k = 3, on unscaled features.
clf = KNeighborsClassifier(n_neighbors=3)
y_pred = clf.fit(X_train, y_train).predict(X_test)

# Accuracy on the held-out test split.
print("Without Scaling and without CV: ", accuracy_score(y_test, y_pred))

# Mean score over 10 cross-validation folds of the training split.
scores = cross_val_score(clf, X_train, y_train, cv=10)
print("Without Scaling and With CV: ", np.mean(scores))

## Model Selection

#### Select the model which gives the maximum accuracy. So we select Random Forest, with accuracy 76.94% on the test split & 77.08% with Cross Validation

In [None]:
# Refit the selected model: a seeded random forest with tree depth capped at 10.
clf = RandomForestClassifier(max_depth=10, random_state=0)
y_pred = clf.fit(X_train, y_train).predict(X_test)
print("Without CV: ", accuracy_score(y_test, y_pred))

# Mean score over 10 cross-validation folds of the training split.
scores = cross_val_score(clf, X_train, y_train, cv=10)
print("With CV: ", np.mean(scores))

# Macro averaging: each metric is computed per class, then averaged unweighted.
for metric_label, metric_fn in (("Precision Score: ", precision_score),
                                ("Recall Score: ", recall_score),
                                ("F1 Score: ", f1_score)):
    print(metric_label, metric_fn(y_test, y_pred, average='macro'))

In [None]:
# Hyper-parameter grid: 2 * 4 * 4 * 3 * 3 = 288 combinations, each evaluated
# with 5-fold cross-validation (1440 fits in total).
param_grid = {
    'bootstrap': [False, True],
    'max_depth': [5, 8, 10, 20],
    'max_features': [3, 4, 5, None],
    'min_samples_split': [2, 10, 12],
    'n_estimators': [100, 200, 300]
}

# Seed the forest so that the search, and therefore best_params_, gives the
# same answer on every run; an unseeded forest can pick a different "best"
# combination each time the cell is executed.
rfc = RandomForestClassifier(random_state=0)

clf = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)

clf.fit(X_train, y_train)

# predict() uses the best estimator found by the search.
y_pred = clf.predict(X_test)
print("Accuracy: ", accuracy_score(y_test, y_pred))
print(clf.best_params_)
print(clf.best_estimator_)

In [None]:
# Random forest rebuilt with hand-copied hyper-parameters (presumably the
# winners of a previous grid-search run; confirm against best_params_).
clf = RandomForestClassifier(bootstrap=False, max_depth=10, max_features=3,
                             min_samples_split=12,
                             n_estimators=100, random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Without CV: ", accuracy_score(y_test, y_pred))

# Mean score over 10 cross-validation folds of the training split.
scores = cross_val_score(clf, X_train, y_train, cv=10)
print("With CV: ", scores.mean())

# Use macro averaging, as for the untuned model, so the two sets of metrics are
# comparable. Micro-averaged precision, recall and F1 on a single-label
# multiclass problem all equal plain accuracy and add no information.
print("Precision Score: ", precision_score(y_test, y_pred, average='macro'))
print("Recall Score: ", recall_score(y_test, y_pred, average='macro'))
print("F1 Score: ", f1_score(y_test, y_pred, average='macro'))