In [47]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

In [48]:
def split_trainingData(predictors,target,train_percentage):
    """
    Split the dataset with train_percentage
    :param dataset:
    :param train_percentage:
    :param feature_headers:
    :param target_header:
    :return: train_x, test_x, train_y, test_y
    """
 
    # Split dataset into train and test dataset
    X_train, X_test, Y_train, Y_test = train_test_split(predictors, target, train_size=train_percentage)
    return X_train, X_test, Y_train, Y_test

In [49]:
def predictors_target(data_Train):
    """
    Split the predictors from target values in training dataset
    
    :param training Dataset
    """
    
    predictors = data_Train.iloc[:,1:12]
    target = data_Train.iloc[:,12]
    
    return predictors, target
    

In [50]:
def predictors_test(data_Test):
    """
    Get the predictors in testing dataset
    
    :param testing Dataset
    """
    
    tsetPredictors = data_Test.iloc[:,1:12]
    
    return tsetPredictors

In [71]:
def random_forest_classifier(features, target):
    """
    To train the random forest classifier with features and target data
    :param features:
    :param target:
    :return: trained random forest classifier
    """

    
    param_grid = [
    {
        'n_estimators':list(range(50,510,50)),
        #'max_depth': list(range(50,160,10))
    }]
    
    grid = GridSearchCV(RandomForestClassifier(),cv=5, n_jobs=1, param_grid=param_grid, verbose=5, scoring='accuracy')
    grid.fit(features,target)
    results = pd.DataFrame(grid.cv_results_)
    results.sort_values(by='rank_test_score', inplace=True)
    results.head(5)
    optimised_random_forest = grid.best_estimator_
    return grid

In [72]:
def writeCSV(ID,data,outputFile):
    """
    Write to a csv file
    
    :param ID -->id column 
    :param data -->data to write on a csv file
    :param outputFile --> output file name to write data
    """
    
    df=pd.DataFrame()
    df['Id']=ID
    df['type']=data
    df.to_csv(outputFile,index=0)

In [76]:
# Main Function
def main():
    """
    Main function
    :return:
    """
    # Load the csv file into pandas dataframe
    data_Samp=pd.read_csv("C:/Users/Ali/Documents/GitHub/ECEN689-Fall2018/Challenges/4Files/winequality-combined-sample.csv")
    data_Train=pd.read_csv("C:/Users/Ali/Documents/GitHub/ECEN689-Fall2018/Challenges/4Files/winequality-combined-training.csv")
    data_Test=pd.read_csv("C:/Users/Ali/Documents/GitHub/ECEN689-Fall2018/Challenges/4Files/winequality-combined-testing.csv")
    
    # Split Data for Predictors and Target
    predictors,target= predictors_target(data_Train)
    
    # Split Training data into train and test datasets
    X_train, X_test, Y_train, Y_test= split_trainingData(predictors, target, 0.8)
   
    # Training a model. Change the Function Name for the model you want
    trained_model= random_forest_classifier(X_train,Y_train)
    print("@@@@@@@@@@@@@@@@@@",trained_model.best_estimator_)
    print("$$$$$$$$$$$$$$$$$$",trained_model.best_score_)
    
    # Prediction on Validation Data
    Y_pred= trained_model.predict(X_test)
    
    # Calculating Accuracy on Validation data set
    score = accuracy_score(Y_test,Y_pred)
    print("*****************",score)
    
    # Prediction on Test Data
    X_train=data_Train.iloc[:,1:12]
    Y_train=data_Train.iloc[:,12]
    testPredictors=predictors_test(data_Test)

    trained_model.fit(X_train,Y_train)
    Y_pred_test= trained_model.predict(testPredictors)
    
    # Writing to a csv file
    outputFile='winequality-combined-solution_V4.csv'
    writeCSV(data_Test['Id'],Y_pred_test,outputFile)
    
    
    
if __name__ == "__main__":
    main()



Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] n_estimators=50 .................................................
[CV] ........ n_estimators=50, score=0.9938800489596084, total=   0.1s
[CV] n_estimators=50 .................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s


[CV] ......... n_estimators=50, score=0.992638036809816, total=   0.1s
[CV] n_estimators=50 .................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.4s remaining:    0.0s


[CV] ........ n_estimators=50, score=0.9938650306748467, total=   0.1s
[CV] n_estimators=50 .................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.6s remaining:    0.0s


[CV] ........ n_estimators=50, score=0.9938650306748467, total=   0.1s
[CV] n_estimators=50 .................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.8s remaining:    0.0s


[CV] ........ n_estimators=50, score=0.9889570552147239, total=   0.2s
[CV] n_estimators=100 ................................................
[CV] ......... n_estimators=100, score=0.99265605875153, total=   0.3s
[CV] n_estimators=100 ................................................
[CV] ....... n_estimators=100, score=0.9914110429447853, total=   0.3s
[CV] n_estimators=100 ................................................
[CV] ....... n_estimators=100, score=0.9950920245398773, total=   0.3s
[CV] n_estimators=100 ................................................
[CV] ........ n_estimators=100, score=0.996319018404908, total=   0.3s
[CV] n_estimators=100 ................................................
[CV] ....... n_estimators=100, score=0.9914110429447853, total=   0.3s
[CV] n_estimators=150 ................................................
[CV] ....... n_estimators=150, score=0.9938800489596084, total=   0.5s
[CV] n_estimators=150 ................................................
[CV] .

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:   56.1s finished


@@@@@@@@@@@@@@@@@@ RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
$$$$$$$$$$$$$$$$$$ 0.9933774834437086
***************** 0.9970588235294118
Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] n_estimators=50 .................................................
[CV] ........ n_estimators=50, score=0.9931372549019608, total=   0.1s
[CV] n_estimators=50 .................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s


[CV] ........ n_estimators=50, score=0.9950980392156863, total=   0.1s
[CV] n_estimators=50 .................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.6s remaining:    0.0s


[CV] ........ n_estimators=50, score=0.9941176470588236, total=   0.1s
[CV] n_estimators=50 .................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.8s remaining:    0.0s


[CV] ........ n_estimators=50, score=0.9960745829244357, total=   0.1s
[CV] n_estimators=50 .................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    1.1s remaining:    0.0s


[CV] ........ n_estimators=50, score=0.9970530451866405, total=   0.1s
[CV] n_estimators=100 ................................................
[CV] ....... n_estimators=100, score=0.9901960784313726, total=   0.4s
[CV] n_estimators=100 ................................................
[CV] ....... n_estimators=100, score=0.9950980392156863, total=   0.3s
[CV] n_estimators=100 ................................................
[CV] ....... n_estimators=100, score=0.9950980392156863, total=   0.3s
[CV] n_estimators=100 ................................................
[CV] ....... n_estimators=100, score=0.9960745829244357, total=   0.4s
[CV] n_estimators=100 ................................................
[CV] ....... n_estimators=100, score=0.9980353634577603, total=   0.3s
[CV] n_estimators=150 ................................................
[CV] ....... n_estimators=150, score=0.9911764705882353, total=   0.5s
[CV] n_estimators=150 ................................................
[CV] .

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  1.1min finished
