#### Objective of this kernel:
* to further explore the data
* To clean the out put the Dataset to be fed into the various machine algorithms

### Importing our libraries 

In [57]:
import numpy as np
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns


# Input data files are available in the "../output/" directory.
import os
for dirname, _, filenames in os.walk('./output/'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
# Any results you write to the current directory are saved as output.

./output/linearmodel.pickle
./output/KNNmodel.pickle
./output/processed_data.csv
./output/cleaned_data.csv
./output/Logregmodel.pickle
./output/SVMmodel.pickle
./output/.ipynb_checkpoints/cleaned_data-checkpoint.csv


## Reading the data set

In [50]:
df = pd.read_csv('./output/cleaned_data.csv')
df.head()

Unnamed: 0,Age,Gender,wassce grade,level,current CGPA,access to a laptop or internet,study group,time spent on independent studies
0,23 - 27 years,Male,16 - 20,L 400,2.9 - 2.5,Yes,Yes,More than 2 hours
1,23 - 27 years,Male,6 - 9,L 400,1.9 - 1.5,Yes,Yes,More than 2 hours
2,23 - 27 years,Male,10 - 15,L 400,3.5 - 3.0,Yes,Yes,Less than 2 hours
3,23 - 27 years,Female,10 - 15,L 400,2.9 - 2.5,Yes,Yes,More than 2 hours
4,23 - 27 years,Female,10 - 15,L 400,2.9 - 2.5,Yes,Yes,More than 2 hours


## we are going to transform all our categorical data to proper format to be fed to our machine learning algorithms
* We preprocess our data with the LabelEncoder from sklearn

#### Preprocessing our independent variable x and our dependent variable y

In [6]:
from sklearn.preprocessing import LabelEncoder
label_encoder_x=LabelEncoder()
#preprocessing x
Age=label_encoder_x.fit_transform(list(df["Age"]))
Gender=label_encoder_x.fit_transform(list(df["Gender"]))
wassce_grade=label_encoder_x.fit_transform(list(df["wassce grade"]))
level=label_encoder_x.fit_transform(list(df["level"]))
access_to_a_laptop_or_internet=label_encoder_x.fit_transform(list(df["access to a laptop or internet"]))
study_group=label_encoder_x.fit_transform(list(df["study group"]))
time_spent_on_independent_studies=label_encoder_x.fit_transform(list(df["time spent on independent studies"]))

current_CGPA=label_encoder_x.fit_transform(list(df["current CGPA"]))


x=list(zip(Age,Gender,wassce_grade,access_to_a_laptop_or_internet,study_group,time_spent_on_independent_studies,))
y=list(current_CGPA)


# Lets start implementing our machine learning algorithms

In [7]:
from sklearn.model_selection import train_test_split
import pickle

### Impleamenting the linear regression algorithm
* we train our data 100 iterations 
* we pick the best result

In [33]:
# import the linear regression module from sklearn
from sklearn import linear_model
from sklearn.model_selection import train_test_split



best = 0
for _ in range(1000):    
    x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.1)
    linear =linear_model.LinearRegression()
    linear.fit(x_train,y_train)
    linear_score = linear.score(x_test,y_test)
    if linear_score > best:
        best = linear_score
        with open("./output/linearmodel.pickle","wb") as f:
            pickle.dump(linear,f)
print(best)

0.4424655192302943


### Predicting the classes for test set

In [63]:
Y_pred = linear.predict(x_test)


y_pred = np.round_(Y_pred, decimals=0, out=None)
    
print ("Predicted Value:",y_pred)

Predicted Value: [2. 2. 2. 3. 2. 2. 2. 2. 3. 3. 3. 2. 3. 2. 3. 2. 2. 3. 3. 2. 3. 3. 2. 2.]


### Calculating the accuracy of the predictions

In [51]:
from sklearn.metrics import mean_squared_error,mean_squared_log_error,confusion_matrix
print("The mean_squared_error of the linnear regression is: {}".format(mean_squared_error(y_test, y_pred)))
print("mean_squared_log_error of the linnear regression is: {}".format(mean_squared_log_error(y_test, y_pred)))
# outputting more info about our linear regression model
print('coeffitient:\n',linear.coef_)
print('Intercept:\n',linear.intercept_)
# variance score: 1 means perfect prediction
print('Variance score: {}'.format(linear.score(x_test, y_test)))

The mean_squared_error of the linnear regression is: 1.371331164910611
mean_squared_log_error of the linnear regression is: 0.09959130472468092
coeffitient:
 [-0.08217557 -0.12219893  0.17280244  0.0663378   0.22001124  0.22944272]
Intercept:
 1.8119863177146578
Variance score: -0.2735165225956233


#### loading linear regression model so we dont have to rerun our algorithm 

In [21]:
pickle_in = open("./output/linearmodel.pickle","rb")
linear = pickle.load(pickle_in)