##### Importing required packages for the program

In [None]:
# time for calculating running times of the classifiers
import time

# numpy for handling arrays
import numpy as np

# pandas for working with huge data
import pandas as pd

# SMOTE for over sampling
from imblearn.over_sampling import SMOTE

# train_test_split for splitting the data
from sklearn.model_selection import train_test_split 

# PCA for dimensionality reduction
from sklearn.decomposition import PCA

#f1_score to calculate f1 score
from sklearn.metrics import f1_score

: 

##### Function to convert the feature data into matrix or tabular format

In [None]:
# number of feature columns: the given training dataset uses 100000 features
num_features = 100000


def to_tabular_data(file_passed, n_features=None):
    """Convert lines of active feature indices into a dense 0/1 table.

    Parameters
    ----------
    file_passed : sequence of str
        One string per sample holding space-separated, 1-based feature
        indices. A blank string produces an all-zero row.
    n_features : int, optional
        Number of columns in the table. Defaults to the module-level
        ``num_features`` (looked up at call time).

    Returns
    -------
    pandas.DataFrame
        One row per input string and columns ``0 .. n_features - 1``;
        column ``k - 1`` is 1 when index ``k`` occurs in the line, else 0.

    Raises
    ------
    IndexError
        If an index lies outside ``1 .. n_features``. Index 0 used to wrap
        around silently to the last column.
    """
    if n_features is None:
        n_features = num_features

    # Allocate the whole matrix once; growing a DataFrame row by row through
    # .loc re-allocates on every insert and is very slow at this width.
    indicator = np.zeros((len(file_passed), n_features), dtype=np.int8)

    for row, line in enumerate(file_passed):
        # nothing to mark for a sample without features
        if not line.strip():
            continue

        active = np.fromstring(line, dtype=int, sep=' ')
        if active.size == 0:
            continue

        if active.min() < 1 or active.max() > n_features:
            raise IndexError(
                "feature index out of range 1..%d in sample %d" % (n_features, row)
            )

        # indices in the file are 1-based, columns are 0-based
        indicator[row, active - 1] = 1

    return pd.DataFrame(indicator, columns=range(n_features))

: 

##### Function to retrieve the class labels from the training data file

In [None]:
def get_train_labels(file_passed):
    """Read the class label that starts every line of the training file.

    Parameters
    ----------
    file_passed : str
        Path of the training text file. The first whitespace-delimited token
        of each line is taken as that line's label.

    Returns
    -------
    list of str
        The labels, one per line, still as strings (the caller casts them).

    Raises
    ------
    ValueError
        If a line contains no label at all (blank line).
    """
    labels = []

    # the context manager closes the file even when reading or parsing fails
    with open(file_passed, "r") as handle:
        for line_no, line in enumerate(handle, start=1):
            # take the whole first token instead of a single character so
            # that labels such as "-1" are not truncated
            tokens = line.split(None, 1)
            if not tokens:
                raise ValueError(
                    "line %d of %s has no label" % (line_no, file_passed)
                )
            labels.append(tokens[0])

    return labels

: 

##### Function to retrieve the features data from the training data file

In [None]:
def get_traitotal_ft(file_passed):
    """Read the feature part of every training line and tabulate it.

    Parameters
    ----------
    file_passed : str
        Path of the training text file. Each line is expected to look like
        ``<label>\\t<feature indices>``.

    Returns
    -------
    The table built by ``to_tabular_data`` from the feature strings,
    one row per line of the file.
    """
    features = []

    # the context manager closes the file even when reading fails
    with open(file_passed, "r") as handle:
        for line in handle:
            line = line.rstrip("\n")

            # Cut at the first tab only. Removing the substrings "0\t" and
            # "1\t" wherever they occur mangles any label other than 0/1.
            _label, tab, remainder = line.partition("\t")

            # a line without a tab is passed through unchanged, as before
            features.append(remainder if tab else line)

    # map the feature strings onto the 0/1 table
    return to_tabular_data(features)

: 

##### Function to retrieve the features from the test data file which will be used to predict the class labels after training the model

In [None]:
def get_test_features(file_passed):
    """Read the unlabelled test file and tabulate its features.

    Parameters
    ----------
    file_passed : str
        Path of the test text file; every line holds only feature indices
        (there is no class label to strip).

    Returns
    -------
    The table built by ``to_tabular_data`` from the lines of the file,
    one row per line.
    """
    # the context manager closes the file even when reading fails
    with open(file_passed, "r") as handle:
        # drop the newline characters so only the feature data remains
        features = [line.replace("\n", "") for line in handle]

    # map the feature strings onto the 0/1 table
    return to_tabular_data(features)

: 

##### Function to perform PCA on the data to reduce the dimensionality of the sparse data

In [None]:
def perform_pca_on_data(x_train, x_test, test_feat_passed, no_of_components, random_state=None):
    """Project three feature sets onto the principal components of the first.

    The PCA is fitted on ``x_train`` only; the same fitted projection is then
    applied to ``x_test`` and ``test_feat_passed``.

    Parameters
    ----------
    x_train : array-like
        Training features used to fit the PCA.
    x_test : array-like
        Held-out features, transformed with the fitted PCA.
    test_feat_passed : array-like
        Features of the unlabelled test file, transformed with the fitted PCA.
    no_of_components : int
        Number of principal components to keep.
    random_state : int, optional
        Forwarded to ``PCA``. The default ``None`` keeps the previous
        behaviour; pass an integer for repeatable results when PCA uses a
        randomized solver.

    Returns
    -------
    tuple
        ``(reduced x_train, reduced x_test, reduced test features)``.
    """
    # PCA object (not TruncatedSVD) that performs the reduction
    dimension_reduction = PCA(n_components=no_of_components, random_state=random_state)

    # fit on the training split and reduce it in one step
    red_x_train_data = dimension_reduction.fit_transform(x_train)

    # reuse the fitted components for the remaining two sets
    red_x_test_data = dimension_reduction.transform(x_test)
    red_test_data = dimension_reduction.transform(test_feat_passed)

    return red_x_train_data, red_x_test_data, red_test_data

: 

##### A class representing a decision tree of depth one, also called a stump or weak classifier

In [None]:
# weak classifier, also called a stump: a decision tree of depth 1
class treeDepthOne:
    """Decision stump that thresholds a single feature column.

    Attributes are filled in by the booster while it searches for the best
    split; until then everything except the polarity is ``None``.
    """

    def __init__(self):
        # polarity: 1 labels samples below the threshold as -1,
        # any other value applies the inverted rule
        self.posorneg = 1
        # column index of the feature this stump looks at
        self.ftindx = None
        # threshold the feature value is compared against
        self.thrlimit = None
        # weight of this stump's vote in the ensemble
        self.votefunction = None

    def predict(self, trd):
        """Label every row of ``trd`` with -1 or +1.

        Parameters
        ----------
        trd : numpy.ndarray
            2-D feature matrix; only column ``self.ftindx`` is read.

        Returns
        -------
        numpy.ndarray
            Float array with one entry per row, each -1.0 or 1.0.
        """
        column = trd[:, self.ftindx]
        preds = np.ones(trd.shape[0])

        if self.posorneg == 1:
            preds[column < self.thrlimit] = -1
        else:
            # The inverted rule must be the exact complement of the rule
            # above, because TreeBoost.fit scores it as 1 - error of
            # "x < threshold". Samples equal to the threshold therefore
            # have to become -1 here (">=", not ">").
            preds[column >= self.thrlimit] = -1

        return preds

: 

##### A class defined to perform the Adaboost related functions and calculations

In [None]:
def nmToOne(asw):
    """Rescale the weights so that they add up to one."""
    total = np.sum(asw)
    return asw / total

# AdaBoost ensemble built from depth-one trees (stumps)
class TreeBoost:
    """AdaBoost classifier over decision stumps for labels in {-1, +1}.

    Parameters
    ----------
    n_trab : int, default 5
        Number of stumps (boosting rounds) to train.
    """

    def __init__(self, n_trab=5):
        self.n_trab = n_trab
        self.trabs = []

    def fit(self, trd, trl):
        """Train ``n_trab`` stumps on the given data.

        Parameters
        ----------
        trd : numpy.ndarray
            2-D feature matrix, one row per sample.
        trl : numpy.ndarray
            Labels aligned with the rows of ``trd``; the weight update
            multiplies them with the -1/+1 predictions.
        """
        total_lb, total_ft = trd.shape

        # every sample starts with the same weight 1/N
        assigned_weights = np.full(total_lb, (1 / total_lb))

        # small constant that keeps the log argument finite when the error is 0 or 1
        eps = 0.0000000001

        self.trabs = []
        for _ in range(self.n_trab):
            trab = treeDepthOne()
            mm = float("inf")

            # exhaustive search: every feature, every observed value as threshold
            for iy in range(total_ft):
                xdcol = trd[:, iy]

                for thrlimit in np.unique(xdcol):
                    v = 1
                    preds = np.ones(total_lb)
                    preds[xdcol < thrlimit] = -1

                    # weighted error = total weight of the misclassified samples
                    mistake = np.sum(assigned_weights[trl != preds])

                    # worse than chance: use the inverted rule instead
                    if mistake > 0.5:
                        mistake = 1 - mistake
                        v = -1

                    # remember the best combination seen so far
                    if mistake < mm:
                        trab.posorneg = v
                        trab.thrlimit = thrlimit
                        trab.ftindx = iy
                        mm = mistake

            # vote weight of this stump
            trab.votefunction = 0.5 * np.log((1.0 - mm + eps) / (mm + eps))

            # raise the weight of misclassified samples, lower the others
            preds = trab.predict(trd)
            assigned_weights = assigned_weights * np.exp(-trab.votefunction * trl * preds)

            # renormalize so the weights sum to one again
            assigned_weights = nmToOne(assigned_weights)

            self.trabs.append(trab)

    def predict(self, trd):
        """Combine the weighted stump votes into -1/+1 predictions.

        Parameters
        ----------
        trd : numpy.ndarray
            2-D feature matrix, one row per sample.

        Returns
        -------
        numpy.ndarray
            Float array of -1.0 / 1.0. An exactly tied vote is resolved to
            -1.0 so that the output never contains the invalid label 0.
        """
        trab_preds = [trab.votefunction * trab.predict(trd) for trab in self.trabs]
        score = np.sum(trab_preds, axis=0)

        return np.where(score > 0, 1.0, -1.0)

: 

##### The program execution starts here, we load the train and test data from files

In [None]:
# paths of the labelled training file and of the unlabelled test file
train_filepath = "train.txt"
test_filepath = "test.txt"

# the labels are read as strings, so turn them into an integer numpy array
train_lab = np.asarray(get_train_labels(train_filepath)).astype(int)

# the AdaBoost implementation works with labels -1 and +1, so class 0 is recoded as -1 for now
train_lab[train_lab == 0] = -1

# feature tables for the training file and the test file
train_feat = get_traitotal_ft(train_filepath)
test_feat = get_test_features(test_filepath)

: 

##### Splitting the data

In [None]:
# hold out 20% of the labelled training data for evaluation (80/20 split, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    train_feat,
    train_lab,
    test_size=0.2,
    random_state=42,
)

: 

##### Performing dimensionality reduction

In [None]:
# reduce all three feature sets to 500 components for quicker execution
reduced_sets = perform_pca_on_data(X_train, X_test, test_feat, 500)
reduced_x_train, reduced_x_test, reduced_test_ft = reduced_sets

: 

##### As the data is unbalanced, we perform oversampling to balance the data i.e, both the class labels

In [None]:
# SMOTE is a common method for oversampling imbalanced data.
# It generates synthetic samples at random, so the seed is fixed (same value
# as the train/test split) to make the notebook reproducible on re-run.
oversampling = SMOTE(random_state=42)

# re-sample the dimensionality-reduced training data so both classes are balanced
oversampled_train_ft, oversampled_train_labels = oversampling.fit_resample(reduced_x_train, y_train)

: 

##### Calling Adaboost

In [None]:
# AdaBoost classification with 9 weak classifiers
trab = TreeBoost(n_trab=9)

# time the training of the model on the oversampled training data
s1 = time.time()
trab.fit(oversampled_train_ft, oversampled_train_labels)
e1 = time.time()
print("\n Decision tree training time with Adaboost:", round(e1-s1, 3), "s")

# time the prediction on the held-out 20% split; these predictions are
# compared with y_test later, so they are not predictions on training data
s2 = time.time()
train_y_pred = trab.predict(reduced_x_test)
e2 = time.time()
print("\n Decision tree prediction time on validation data with Adaboost:", round(e2-s2, 3), "s")

# time the prediction on the unlabelled test file
s3 = time.time()
test_y_pred = trab.predict(reduced_test_ft)
e3 = time.time()
print("\n Decision tree prediction time on test data with Adaboost:", round(e3-s3, 3), "s")

: 

##### Displaying the classification report for the above model

In [None]:
# classification_report gives precision, recall and F1-score per class
from sklearn.metrics import classification_report

# display names for the two classes (label -1 is shown as class 0, label 1 as class 1)
label_names = ['class 0', 'class 1']

# classification report for the decision stumps with AdaBoost.
# The signature is classification_report(y_true, y_pred): passing the
# predictions first swaps precision and recall in the printed table.
print("\tDecision Tree using AdaBoost:\n")
print(classification_report(y_test, train_y_pred, target_names=label_names))

print("\nF1-score:",f1_score(y_test, train_y_pred))


: 

In [None]:
# the output file uses 0/1 labels, so map the -1 predictions back to 0 and cast to int
test_y_pred = np.where(test_y_pred == -1, 0, test_y_pred).astype(int)

: 

##### Writing the predicted values for the test file into a text file

In [None]:
# write the predictions made by the AdaBoost model, one label per line;
# the context manager closes the file even if a write fails
with open("decision_tree.txt", "w") as pred_file:
    for pred in test_y_pred:
        pred_file.write(str(pred)+"\n")

: 