In [None]:
import gc

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix, plot_precision_recall_curve, plot_roc_curve

from sklearn.model_selection import train_test_split as tts

from sklearn.metrics import accuracy_score, precision_score

from sklearn.model_selection import cross_val_score

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

In [None]:
class Learner:
    """Train a grid of random forests and turn their predictions into submission files.

    One ``RandomForestClassifier`` is fitted for every combination of
    ``n_estimators`` x ``max_depths`` on a 70/30 train/validation split of
    ``train_df``. Each fitted model is stored together with its validation
    accuracy in ``scored_classifiers`` and its per-class report in
    ``classification_reports``.

    Parameters
    ----------
    classes : tuple
        Label names. ``create_file`` looks a name up with ``classes[prediction]``,
        so predictions are assumed to be integer positions into this tuple
        -- TODO confirm against the encoding used for ``status_group``.
    n_estimators : tuple
        Candidate forest sizes.
    max_depths : tuple
        Candidate maximum tree depths.
    train_df : pd.DataFrame
        Labelled data; must contain the columns ``"id"`` and ``"status_group"``.
    test_df : pd.DataFrame
        Unlabelled data; must contain the column ``"id"``.
    submission_df : pd.DataFrame
        Submission template; must contain the column ``"status_group"``.
    random_state : int or None, optional
        Seed forwarded to the train/validation split and to every forest.
        The default ``None`` keeps the original unseeded behaviour.
    """

    def __init__(self, classes : tuple, n_estimators : tuple, max_depths : tuple, train_df : pd.DataFrame, test_df : pd.DataFrame, submission_df : pd.DataFrame, random_state=None):
        self.classes = classes
        self.n_estimators = n_estimators
        self.max_depths = max_depths
        self.train_df = train_df
        self.test_df = test_df
        self.submission_df = submission_df
        self.random_state = random_state

        # Filled by train_model():
        #   scored_classifiers     -> (n_estimators, max_depth, fitted classifier, accuracy)
        #   classification_reports -> one DataFrame per fitted classifier, same order
        self.scored_classifiers = []
        self.classification_reports = []

    def split_dataset(self):
        """Split ``train_df`` 70/30 into train and validation parts.

        Features are every column except ``"id"`` and ``"status_group"``; the
        target is ``"status_group"``. The four parts are also stored on the
        instance because the plotting and cross-validation helpers read them.

        Returns
        -------
        tuple
            ``(train_input, validation_input, train_output, validation_output)``.
        """
        features = self.train_df.drop(columns=["id", "status_group"])
        target = self.train_df["status_group"]
        self.train_input, self.validation_input, self.train_output, self.validation_output = tts(
            features, target, test_size=0.3, random_state=self.random_state
        )
        return self.train_input, self.validation_input, self.train_output, self.validation_output

    def create_file(self, predicited_output, file_num : str):
        """Write ``predicited_output`` into a copy of the submission template as CSV.

        The file is written to ``./datasets/submissions/submission_<file_num>.csv``;
        that directory must already exist. ``submission_df`` itself is not modified.

        Parameters
        ----------
        predicited_output : array-like with a ``shape`` attribute
            One prediction per submission row, in row order. Each value is used
            as an index into ``self.classes``.
        file_num : str
            Suffix used in the output file name.
        """
        expected_rows = self.submission_df.shape[0]
        if predicited_output.shape[0] != expected_rows:
            # Previously this case was skipped without any feedback.
            print(
                "Submission not written: got {} predictions for {} submission rows".format(
                    predicited_output.shape[0], expected_rows
                )
            )
            return

        print(True)
        submission = self.submission_df.copy()
        # Positional, whole-column assignment: does not depend on the frame's index
        # labels and avoids both Series.iteritems() (removed in pandas 2.0) and
        # chained assignment.
        submission["status_group"] = [self.classes[label] for label in predicited_output]
        submission.to_csv("./datasets/submissions/submission_" + file_num + ".csv", index=False, header=True)

    def create_classifier(self, depth : int, estimator : int):
        """Return an unfitted random forest with ``estimator`` trees of at most ``depth`` levels.

        ``n_jobs=-1`` lets scikit-learn use all available cores.
        """
        return RandomForestClassifier(
            max_depth=depth,
            n_estimators=estimator,
            n_jobs=-1,
            random_state=self.random_state,
        )

    def train_model(self):
        """Fit and score one forest per (n_estimators, max_depth) combination.

        Performs a fresh train/validation split, then appends to
        ``classification_reports`` and ``scored_classifiers`` for every
        combination. Calling it again appends further entries.
        """
        train_input, validation_input, train_output, validation_output = self.split_dataset()
        # Use the grids handed to the constructor, not same-named module globals.
        for estimator in self.n_estimators:
            for max_depth in self.max_depths:
                classifier = self.create_classifier(depth=max_depth, estimator=estimator)
                classifier.fit(train_input, train_output)
                predicted_output = classifier.predict(validation_input)
                # accuracy_score expects (y_true, y_pred).
                acc_score = accuracy_score(validation_output, predicted_output)

                self.classification_reports.append(
                    self.create_classification_report(validation_output, predicted_output)
                )
                self.scored_classifiers.append((estimator, max_depth, classifier, acc_score))

    def test_model(self, classifier : "RandomForestClassifier", file_num : int, create_file=True):
        """Predict labels for ``test_df`` and optionally write a submission file.

        Missing values in ``test_df`` are replaced by the column median first;
        the filled frame is stored back on ``self.test_df``.

        Parameters
        ----------
        classifier : RandomForestClassifier
            A fitted classifier.
        file_num : int
            Converted with ``str()`` and used as the submission file suffix.
        create_file : bool, optional
            When true (default) the predictions are passed to ``create_file``.

        Returns
        -------
        The array returned by ``classifier.predict``.
        """
        # numeric_only=True: pandas >= 2.0 raises on non-numeric columns otherwise,
        # while older versions silently skipped them.
        self.test_df = self.test_df.fillna(self.test_df.median(numeric_only=True))
        predicted_output = classifier.predict(self.test_df.drop(columns=["id"]))
        if create_file:
            self.create_file(predicted_output, str(file_num))
        return predicted_output

    def plot_confusion_matrix(self, classifier : "RandomForestClassifier"):
        """Plot the row-normalised confusion matrix of ``classifier`` on the validation split.

        Requires ``split_dataset`` (or ``train_model``) to have run. Returns the
        object produced by ``sklearn.metrics.plot_confusion_matrix``.
        """
        # The bare name below is the sklearn function imported at module level;
        # method names are not visible as bare names inside a method body.
        plot_conf_matrix = plot_confusion_matrix(
            classifier,
            self.validation_input,
            self.validation_output,
            display_labels=self.classes,
            cmap=plt.cm.Blues,
            normalize='true',
        )
        return plot_conf_matrix

    def create_classification_report(self, validation_output, predicted_output):
        """Return sklearn's classification report as a DataFrame with one row per report entry."""
        report = classification_report(validation_output, predicted_output, output_dict=True)
        return pd.DataFrame(report).transpose()

    def get_cross_validation_score(self, classifier : "RandomForestClassifier"):
        """Return the 3-fold cross-validation scores of ``classifier`` on the validation split.

        Requires ``split_dataset`` (or ``train_model``) to have run.
        """
        return cross_val_score(classifier, self.validation_input, self.validation_output, cv=3)

## Public Variables

In [None]:
# Label names; Learner.create_file looks a name up by the predicted value.
classes = (
    "functional",
    "functional needs repair",
    "non functional",
)

# Hyper-parameter grid: one forest is trained per (n_estimators, max_depth) pair.
n_estimators = (
    1000,
    1200,
)
max_depths = (
    40,
    45,
    50,
)

In [None]:
# Input locations, relative to the notebook's working directory.
TRAIN_CSV = "./datasets/preprocessed_data/merged_train.csv"
TEST_CSV = "./datasets/preprocessed_data/test.csv"
SUBMISSION_FORMAT_CSV = "./datasets/SubmissionFormat.csv"

train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)
submission_df = pd.read_csv(SUBMISSION_FORMAT_CSV)

In [None]:
# Remove the 'date_recorded' column from both frames.
# Rebinding the names (instead of inplace=True) together with errors="ignore"
# keeps this cell safe to re-run: a second execution is a no-op rather than a KeyError.
train_df = train_df.drop(columns="date_recorded", errors="ignore")
test_df = test_df.drop(columns="date_recorded", errors="ignore")

In [None]:
# Preview the first five training rows (head() defaults to n=5).
train_df.head()

In [None]:
# Preview the first five test rows (head() defaults to n=5).
test_df.head()

In [None]:
# Preview the first five rows of the submission template (head() defaults to n=5).
submission_df.head()

## Train the model, then predict the test values

In [None]:
# Bundle the label names, the hyper-parameter grid and the three frames into one Learner.
learner = Learner(
    classes=classes,
    n_estimators=n_estimators,
    max_depths=max_depths,
    train_df=train_df,
    test_df=test_df,
    submission_df=submission_df,
)


In [None]:
# Fit one random forest per (n_estimators, max_depth) combination; the fitted models,
# their validation accuracy and their classification reports are stored on `learner`.
learner.train_model()

In [None]:
# Print every classification report, each followed by a blank line,
# a 90-character rule of asterisks and another blank line.
separator = "*" * 90
for report in learner.classification_reports:
    print(report, "", separator, "", sep="\n")

In [None]:
# Each entry is a tuple: (n_estimators, max_depth, fitted classifier, validation accuracy).
classifiers = learner.scored_classifiers

In [None]:
# One normalised confusion matrix per trained forest. Each figure is titled with the
# model's hyper-parameters and validation accuracy so the plots can be told apart.
for n_trees, depth, fitted_classifier, validation_accuracy in classifiers:
    matrix_display = learner.plot_confusion_matrix(classifier=fitted_classifier)
    matrix_display.ax_.set_title(
        "n_estimators={}, max_depth={} (validation accuracy={:.3f})".format(
            n_trees, depth, validation_accuracy
        )
    )
plt.show()

In [None]:
#for tuple_item in classifiers:
    #plot = learner.get_cross_validation_score(classifier=tuple_item[2])

In [None]:
# Predict the test set with every trained forest and write one submission file each.
# The file suffix is n_estimators + max_depth (e.g. 1000 trees at depth 40 -> "1040").
# NOTE(review): different pairs with the same sum would overwrite each other's file.
for n_trees, depth, fitted_classifier, _validation_accuracy in classifiers:
    predicted_output = learner.test_model(
        classifier=fitted_classifier,
        file_num=n_trees + depth,
    )