In [1]:
import pandas as pd
import numpy as np
import logging
# Route every record (DEBUG and above) to file.log, prefixed with its
# timestamp and level name.
logging.basicConfig(
    filename='file.log',
    level=logging.DEBUG,
    format='%(asctime)s %(levelname)s %(message)s',
)

In [2]:

def loadData(trainingFile, testingFile, n_features=100000):
    """Load two sparse index files into dense 0/1 feature tables.

    Every line of both files is a whitespace-separated list of 1-based
    feature indices that are switched on for that sample. A line of the
    training file may additionally start with a label followed by a tab
    (``label<TAB>indices``); only that leading label is removed.

    Parameters
    ----------
    trainingFile : str
        Path to the training file.
    testingFile : str
        Path to the testing/validation file (indices only, no label prefix).
    n_features : int, optional
        Number of columns of the dense tables. Defaults to 100000.

    Returns
    -------
    train_data : pandas.DataFrame
        One row per training line, columns ``0 .. n_features - 1``, values
        0/1 stored as int8. Feature index ``k`` maps to column ``k - 1``.
    test_data : pandas.DataFrame
        Same layout, built from the testing file.
    train_labels : numpy.ndarray
        One string per training line: the text before the first tab, or,
        when the line contains no tab, just its first character (kept for
        backward compatibility; in that case it is not a real label).

    Raises
    ------
    IndexError
        If a feature index lies outside ``1 .. n_features``.
    ValueError
        If a token on a line is not an integer.
    """
    logging.info('loading the data....')

    def convertDataframe(inputFile):
        """Turn a list of index strings into a dense 0/1 DataFrame."""
        # Fill one preallocated matrix instead of appending rows to a
        # DataFrame one .loc at a time, which re-allocates on every row.
        # int8 is enough for 0/1 and keeps the table 8x smaller than int64.
        dense = np.zeros((len(inputFile), n_features), dtype=np.int8)
        logging.info("created an empty dataframe of %d features", n_features)
        for row, line in enumerate(inputFile):
            # str.split() handles spaces, tabs and the trailing newline.
            indices = np.array([int(token) for token in line.split()],
                               dtype=np.int64)
            if indices.size == 0:
                # A line without indices stays an all-zero row.
                continue
            # Reject index 0 explicitly: `0 - 1 == -1` would otherwise
            # silently switch on the LAST column.
            if indices.min() < 1 or indices.max() > n_features:
                raise IndexError(
                    "feature index out of range 1..%d on line %d"
                    % (n_features, row + 1)
                )
            dense[row, indices - 1] = 1
        logging.info('all the entries are pushed into the dataframe successfully')
        return pd.DataFrame(dense, columns=range(n_features))

    with open(trainingFile, "r") as fr1:
        trainFile = fr1.readlines()

    # Split each training line into label and data.
    train_data_list = []
    train_labels_list = []
    for inputData in trainFile:
        inputData = inputData.rstrip("\n")
        # Only the text before the FIRST tab is the label; replacing every
        # "0\t"/"1\t" in the line would also eat parts of the indices.
        label, tab, features = inputData.partition("\t")
        if tab:
            train_labels_list.append(label)
            train_data_list.append(features)
        else:
            # No label prefix: keep the whole line as data and report the
            # first character, as earlier versions of this function did.
            train_labels_list.append(inputData[:1])
            train_data_list.append(inputData)

    train_labels = np.asarray(train_labels_list)
    train_data = convertDataframe(train_data_list)

    with open(testingFile, "r") as fr2:
        testFile = fr2.readlines()

    test_data = convertDataframe(testFile)
    logging.info('all the files are loaded successfully and splitted into train data, valid and train label')
    return train_data, test_data, train_labels

In [3]:
# Build the dense feature tables for the train and validation files.
# NOTE(review): train_label returned here is reassigned further down from
# dorothea_train.labels, so this value is not the one used for modelling.
train_data,valid_data,train_label = loadData('dorothea_train.data','dorothea_valid.data')

In [4]:
# Display the validation feature table returned by loadData.
valid_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,99990,99991,99992,99993,99994,99995,99996,99997,99998,99999
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
345,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
346,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
347,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
348,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0


In [5]:
from sklearn.decomposition import PCA
logging.info('decomposing our data into 5 components')
# random_state makes the projection repeatable across runs in case PCA
# selects a randomized SVD solver for a matrix of this size.
decomposer = PCA(n_components=5, random_state=0)
# Learn the components from the training data only ...
train_data = decomposer.fit_transform(train_data)
# ... and project the validation data onto those SAME components.
# Calling fit_transform here would fit a second, unrelated basis, so the
# model trained below would be scored on incompatible features.
valid_data = decomposer.transform(valid_data)

In [6]:
# Read one integer label per line. The context manager closes the file,
# and int() ignores surrounding whitespace, so a final line without a
# trailing newline is parsed correctly (slicing off the last character
# would drop a digit there). Blank lines are skipped.
with open('dorothea_train.labels', 'r') as f:
    train_label = np.array([int(line) for line in f if line.strip()], dtype=int)
# Recode the -1 class as 0 so the labels are 0/1.
train_label[train_label == -1] = 0
train_label.shape

(800,)

In [7]:
# Read one integer label per line. The context manager closes the file,
# and int() ignores surrounding whitespace, so a final line without a
# trailing newline is parsed correctly (slicing off the last character
# would drop a digit there). Blank lines are skipped.
with open('dorothea_valid.labels', 'r') as f:
    valid_label = np.array([int(line) for line in f if line.strip()], dtype=int)
# Recode the -1 class as 0 so the labels are 0/1.
valid_label[valid_label == -1] = 0
valid_label.shape

(350,)

In [8]:
# Display the validation labels after -1 has been recoded to 0.
valid_label

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [9]:
# Shape of the training features after the PCA projection.
train_data.shape

(800, 5)

In [10]:
# First row of the PCA-projected training features.
train_data[0]

array([-1.12587619,  0.12839593, -0.10299927, -0.31720426, -0.16558635])

# Model building

In [11]:
from sklearn.linear_model import LogisticRegression

# Fit a default-configured logistic regression on the PCA-projected
# training features; a log record is written before and after the fit.
logging.info('checking our model with logistic regression')
model = LogisticRegression().fit(train_data, train_label)
logging.info('model fitted successfully ')

In [12]:
# Shape of the training label array passed to model.fit.
train_label.shape

(800,)

In [13]:
# Predictions on the same features the model was fitted on.
train_pred = model.predict(train_data)

In [14]:
# Predictions on the PCA-projected validation features.
valid_pred=model.predict(valid_data)

## Model evaluation

In [15]:
from sklearn.metrics import confusion_matrix, f1_score

In [22]:
# Write the confusion matrix of the training predictions to the log.
logging.info(f'This is for train metrix \n{confusion_matrix(train_label,train_pred)}')

In [24]:
# Write the confusion matrix of the validation predictions to the log.
logging.info(f'this is for testing metrix \n{confusion_matrix(valid_label,valid_pred)}')

In [25]:
# Write the F1 score of the training and validation predictions to the log.
logging.info(f"This is f1 score for train data: {f1_score(train_label,train_pred)}")
logging.info(f"This is f1 score for valid data: {f1_score(valid_label,valid_pred)}")