In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the pre-processed ODI-2019 CSV; the path is resolved relative to the
# notebook's working directory.
data = pd.read_csv(filepath_or_buffer='ODI-2019-processed.csv')

In [None]:
# Recode the textual answers held in columns 1-4 of `data`:
#   'unknown' -> None, the first token list -> 1, the second token list -> 0.
# NOTE(review): `replace(..., value=None)` is version-dependent in pandas. To my
# knowledge releases before 1.4 ignored an explicit None and forward-filled the
# matched cells instead, while later releases really write None. Confirm which
# behaviour this analysis expects.
mapping = {
    None: ['unknown'],
    1: ['yes', 'mu', 'ja', 'male', '1'],
    0: ['no', 'sigma', 'nee', 'female', '0'],
}
for key in mapping:
    data[data.columns[[1, 2, 3, 4]]] = (
        data[data.columns[[1, 2, 3, 4]]]
        .replace(value=key, to_replace=mapping[key])
    )

In [None]:
# Independent working copy of columns 1-4 and 9, so later edits leave `data` untouched.
test_data = data.iloc[:, [1, 2, 3, 4, 9]].copy()
# Cast the second selected column to int by way of its string form
# (presumably its values may arrive as numeric strings — TODO confirm).
test_data[test_data.columns[1]] = test_data.iloc[:, 1].astype(str).astype(int)

In [None]:
# Give the five selected columns short names, pairing them up by position.
old_labels = test_data.columns.tolist()
new_labels = ['ML', 'IR', 'ST', 'DB', 'Programme']
mapping = {old: new for old, new in zip(old_labels, new_labels)}
test_data = test_data.rename(columns=mapping)

In [None]:
# Encode every distinct programme as an integer code, numbered in order of
# first appearance in the column.
labels = test_data['Programme'].unique().tolist()
mapping = {label: code for code, label in enumerate(labels)}
# `Series.map` builds the numeric column directly. The previous
# `replace(..., inplace=True)` mutated the frame in place and depended on pandas
# silently downcasting the object column to integers (deprecated behaviour).
test_data = test_data.assign(Programme=test_data['Programme'].map(mapping))

# Pairplots

In [None]:
# Grid of pairwise plots over the columns of the encoded frame.
sns.pairplot(data=test_data)

# Correlation

In [None]:
# Pairwise Pearson correlation between the columns of `test_data`;
# the bare name on the last line displays the matrix as the cell output.
corr = test_data.corr(method='pearson')
corr

In [None]:
# Mask the upper triangle (diagonal included): the correlation matrix is
# symmetric, so only the lower triangle needs to be drawn.
# The builtin `bool` is used because the `np.bool` alias was removed in
# NumPy 1.24 and raises AttributeError there.
mask = np.triu(np.ones_like(corr, dtype=bool))

f, ax = plt.subplots(figsize=(11, 9))

# Diverging palette so that the colour midpoint sits at `center=0` below.
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw onto the axes created above explicitly instead of relying on the
# implicit "current axes" state.
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)

# Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
def regression(X_train, X_test, y_train, y_test):
    """Fit a linear regression on the training split and label the test rows.

    Parameters
    ----------
    X_train, y_train : training features and targets passed to
        ``LinearRegression.fit``.
    X_test : DataFrame of features to predict for; it is not modified.
    y_test : accepted but not used by this function.

    Returns
    -------
    DataFrame
        A copy of ``X_test`` with an extra ``'predictions'`` column holding the
        model output rounded to the nearest whole number and cast to int.
    """
    model = LinearRegression().fit(X_train, y_train)

    # `assign` returns a new frame, so the caller's X_test stays untouched.
    labelled = X_test.assign(predictions=model.predict(X_test).round())

    return labelled.astype({'predictions': int})

In [None]:
def testregression(X_test_data, test_data, prediction_column):
    count = 0
    for x in range(len(test_data)):
        try:
            if X_test_data.loc[x]['predictions']==test_data.loc[x][prediction_column]:
                count += 1
        except:
            pass

    return count

In [None]:
def kfoldlearning(X,y,prediction_column, folds = 2, random_state = None):
    """Print per-fold and mean accuracy of `regression` under shuffled k-fold CV.

    Parameters
    ----------
    X : DataFrame of features.
    y : target values, positionally aligned with the rows of ``X``.
    prediction_column : name under which the target is handed to
        ``testregression`` for scoring.
    folds : number of folds (default 2).
    random_state : optional seed for the shuffle; ``None`` (the default) keeps
        the previous unseeded behaviour, an int makes the splits reproducible.

    Returns
    -------
    None. One line per fold and the mean accuracy are printed.
    """
    # scikit-learn k-fold cross-validation
    from sklearn.model_selection import KFold

    # Ground truth labelled like X, so scoring is driven by the `y` argument
    # rather than by a notebook-level global frame.
    target = pd.Series(np.asarray(y), index=X.index, name=prediction_column)
    truth = target.to_frame()
    performances = []

    # prepare cross validation; these arguments are keyword-only in current
    # scikit-learn releases
    kfold = KFold(n_splits=folds, shuffle=True, random_state=random_state)
    # enumerate splits
    for train_index, test_index in kfold.split(X, target):
        # split() yields positions, so both X and the target are sliced with iloc
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = target.iloc[train_index], target.iloc[test_index]

        X_test_data = regression(X_train, X_test, y_train, y_test)
        count = testregression(X_test_data, truth, prediction_column)
        print(str(count) + " correct out of " + str(len(X_test_data)))
        performances.append(count/len(X_test_data))

    print("\n" + str((np.mean(performances)*100).round(1)) + "% accuracy")

## Linear Regression Programmes

In [None]:
# Features are the first four columns (ML, IR, ST, DB); the target is the
# fifth column (Programme).
X = test_data.iloc[:, [0, 1, 2, 3]]
y = test_data.iloc[:, 4]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 40% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=101,
)

In [None]:
# Fit on the training split, then count the held-out rows whose rounded
# prediction equals the true programme code.
X_test_data = regression(X_train, X_test, y_train, y_test)
count = testregression(X_test_data, test_data, prediction_column='Programme')
f"{count} correct out of {len(X_test_data)}"

## K-fold Programmes

In [None]:
# Cross-validated prediction of Programme (fifth column) from the first four
# columns: ML, IR, ST and DB.
X = test_data.iloc[:, [0, 1, 2, 3]]
y = test_data.iloc[:, 4]
folds = 7
kfoldlearning(X, y, 'Programme', folds=folds)

## K-fold Machine Learning

In [None]:
# Cross-validated prediction of ML (first column) from IR, ST and DB
# (second to fourth columns).
X = test_data.iloc[:, [1, 2, 3]]
y = test_data.iloc[:, 0]
folds = 7
kfoldlearning(X, y, 'ML', folds=folds)