In [1]:
import pandas as pd
import numpy as np
import logging
# Route every log record (DEBUG and above) to file.log with a timestamped format.
logging.basicConfig(
    filename='file.log',
    level=logging.DEBUG,
    format='%(asctime)s %(levelname)s %(message)s',
)

In [2]:

def loadData(trainingFile, testingFile):
    """Load two sparse index files into dense 0/1 feature frames.

    Each line of either file is a whitespace-separated list of 1-based
    feature indices that are "on" for that record. A training line may
    additionally start with a ``0<TAB>`` / ``1<TAB>`` label prefix, which is
    removed before the indices are parsed.

    Args:
        trainingFile: Path of the training data file.
        testingFile: Path of the test/validation data file.

    Returns:
        (train_data, test_data, train_labels) where the first two are
        DataFrames with one row per input line and 100000 columns of 0/1
        values (stored as int8), and ``train_labels`` is a NumPy array
        holding the first character of every training line.
    """
    n_features = 100000
    logging.info('loading the data....')

    def convertDataframe(inputFile):
        """Turn a list of index lines into a dense 0/1 DataFrame."""
        # Fill one preallocated matrix and wrap it in a DataFrame once;
        # appending rows with ``.loc`` re-allocates the frame on every line.
        matrix = np.zeros((len(inputFile), n_features), dtype=np.int8)
        logging.info("created an empty dataframe of 100000 features")
        for row, line in enumerate(inputFile):
            # split() tolerates spaces, tabs and the trailing newline.
            indices = np.fromiter((int(token) for token in line.split()), dtype=np.intp)
            # Indices in the file are 1-based. As with the list-based
            # version, an index of 0 wraps around to the last column and an
            # index above n_features raises IndexError.
            matrix[row, indices - 1] = 1
        logging.info('all the entries are pushed into the dataframe successfully')
        return pd.DataFrame(matrix, columns=range(n_features))

    with open(trainingFile, "r") as fr1:
        trainFile = fr1.readlines()

    # Split each training line into its label and its index list.
    train_data_list = []
    train_labels_list = []

    for inputData in trainFile:
        train_labels_list.append(inputData[0])

        record = inputData.replace("\n", "")
        # Strip the label only when it is a leading "0\t"/"1\t" prefix;
        # a blanket replace() would also eat the tail of indices such as "10\t".
        if record.startswith(("0\t", "1\t")):
            record = record[2:]
        train_data_list.append(record)

    train_labels = np.asarray(train_labels_list)
    train_data = convertDataframe(train_data_list)

    with open(testingFile, "r") as fr2:
        testFile = fr2.readlines()

    test_data = convertDataframe(testFile)
    logging.info('all the files are loaded successfully and splitted into train data, valid and train label')
    return train_data, test_data, train_labels

In [3]:
# Build the dense train/validation feature frames from the Dorothea files.
# The train_label returned here is replaced later by the values read from
# dorothea_train.labels.
train_data, valid_data, train_label = loadData(
    'dorothea_train.data',
    'dorothea_valid.data',
)

In [4]:
# Show the validation feature frame returned by loadData (raw 0/1 values, not yet scaled).
valid_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,99990,99991,99992,99993,99994,99995,99996,99997,99998,99999
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
345,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
346,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
347,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
348,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0


In [5]:
from sklearn.preprocessing import MinMaxScaler
# Fit the min-max scaler on the training features and rescale them to [0, 1].
scaler = MinMaxScaler(feature_range=(0, 1))
train_data = scaler.fit_transform(train_data)

In [6]:

# Re-use the scaler fitted on the training data. Calling fit_transform here
# would re-fit the scaler on the validation set, so the two sets would be
# scaled with different parameters.
valid_data = scaler.transform(valid_data)

In [7]:
from sklearn.decomposition import PCA
logging.info('decomposing our data into 5 components')
# Seed PCA so that a randomized SVD solver, if selected, gives repeatable components.
decomposer = PCA(n_components=5, random_state=42)
train_data = decomposer.fit_transform(train_data)
# Project the validation data onto the components learned from the training
# data. Re-fitting PCA on the validation set would yield a different basis,
# making validation features incomparable with what the models were trained on.
valid_data = decomposer.transform(valid_data)

In [8]:
# Read the training labels (one integer per line) and remap -1 to 0.
# The context manager closes the file; int() ignores surrounding whitespace,
# so a missing trailing newline on the last line no longer truncates the value.
with open('dorothea_train.labels', 'r') as label_file:
    train_label = np.array([int(line) for line in label_file if line.strip()], dtype=int)
train_label[train_label == -1] = 0
train_label.shape

(800,)

In [9]:
# Read the validation labels (one integer per line) and remap -1 to 0.
# The context manager closes the file; int() ignores surrounding whitespace,
# so a missing trailing newline on the last line no longer truncates the value.
with open('dorothea_valid.labels', 'r') as label_file:
    valid_label = np.array([int(line) for line in label_file if line.strip()], dtype=int)
valid_label[valid_label == -1] = 0
valid_label.shape

(350,)

In [10]:
# Display the validation labels after -1 has been remapped to 0.
valid_label

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [11]:
# Shape of the training matrix after the PCA projection: (samples, components).
train_data.shape

(800, 5)

In [12]:
# First training sample expressed in the PCA components.
train_data[0]

array([-1.12591203,  0.12833059, -0.10287597, -0.31350669, -0.15629315])

# Model Building

In [13]:
# from sklearn.linear_model import LogisticRegression
# logging.info('checking our model with logistic regression')
# model = LogisticRegression()
# model.fit(train_data,train_label)
# logging.info('model fitted successfully ')
# train_pred = model.predict(train_data)
# valid_pred=model.predict(valid_data)
# from sklearn.metrics import confusion_matrix, f1_score
# logging.info(f'This is for train metrix \n{confusion_matrix(train_label,train_pred)}')
# logging.info(f'this is for testing metrix \n{confusion_matrix(valid_label,valid_pred)}')
# logging.info(f"This is f1 score for train data: {f1_score(train_label,train_pred)}")
# logging.info(f"This is f1 score for valid data: {f1_score(valid_label,valid_pred)}")

In [14]:
# Shape of the training label vector; its length should match the rows of train_data.
train_label.shape

(800,)

## Model Evaluation

In [15]:
# Repeated sanity check of the training matrix shape before model evaluation.
train_data.shape

(800, 5)

In [16]:
# Repeated sanity check of the training label vector shape.
train_label.shape

(800,)

 # <p style='color:red'>  DECISION TREE </p>

In [17]:
# from sklearn.neighbors import KNeighborsClassifier
# model2=KNeighborsClassifier
# logging.info('checking our model with KNeighbors CLASSIFIER')
# knn=model2(n_neighbors=7)
# knn.fit(train_data,train_label)
# logging.info('model fitted successfully ')
# train_pred = knn.predict(train_data)
# valid_pred=knn.predict(valid_data)
# from sklearn.metrics import confusion_matrix, f1_score
# logging.info(f'This is for train metrix \n{confusion_matrix(train_label,train_pred)}')
# logging.info(f'this is for testing metrix \n{confusion_matrix(valid_label,valid_pred)}')
# logging.info(f"This is f1 score for train data: {f1_score(train_label,train_pred)}")
# logging.info(f"This is f1 score for valid data: {f1_score(valid_label,valid_pred)}")

In [18]:
# Repeated sanity check of the training matrix shape.
train_data.shape

(800, 5)

In [19]:
# Repeated look at the first training sample in PCA space.
train_data[0]

array([-1.12591203,  0.12833059, -0.10287597, -0.31350669, -0.15629315])

In [20]:
# from sklearn.tree import DecisionTreeClassifier
# model3=DecisionTreeClassifier()
# logging.info('checking our model with DECISION TREE CLASSIFIER')
# model3.fit(train_data,train_label)
# logging.info('model fitted successfully ')
# train_pred = model3.predict(train_data)
# valid_pred=model3.predict(valid_data)
# from sklearn.metrics import confusion_matrix, f1_score
# logging.info(f'This is for train metrix \n{confusion_matrix(train_label,train_pred)}')
# logging.info(f'this is for testing metrix \n{confusion_matrix(valid_label,valid_pred)}')
# logging.info(f"This is f1 score for train data: {f1_score(train_label,train_pred)}")
# logging.info(f"This is f1 score for valid data: {f1_score(valid_label,valid_pred)}")

In [21]:
# from sklearn.svm import SVC
# model4=SVC()
# logging.info('checking our model with SVM CLASSIFIER')
# model4.fit(train_data,train_label)
# logging.info('model fitted successfully ')
# train_pred = model4.predict(train_data)
# valid_pred=model4.predict(valid_data)
# from sklearn.metrics import confusion_matrix, f1_score
# logging.info(f'This is for train metrix \n{confusion_matrix(train_label,train_pred)}')
# logging.info(f'this is for testing metrix \n{confusion_matrix(valid_label,valid_pred)}')
# logging.info(f"This is f1 score for train data: {f1_score(train_label,train_pred)}")
# logging.info(f"This is f1 score for valid data: {f1_score(valid_label,valid_pred)}")

In [22]:
# Tally how many times each distinct label value appears in the training labels.
unique_values, counts = np.unique(
    train_label,
    return_counts=True,
)

In [23]:
# Report the class balance, one line per distinct label.
label_frequencies = zip(unique_values, counts)
for value, count in label_frequencies:
    summary_line = f"{value} occurs {count} times"
    print(summary_line)

0 occurs 722 times
1 occurs 78 times


In [24]:
# from sklearn.model_selection import GridSearchCV, StratifiedKFold
# from sklearn.ensemble import RandomForestClassifier

# def find_best_hyperparameters(X, y, param_grid, n_splits=5, random_state=42):

#   # Create a StratifiedKFold object
#   skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

#   # Create a Random Forest Classifier
#   rf_classifier = RandomForestClassifier(random_state=random_state)

#   # Create a Grid Search CV object
#   grid_search = GridSearchCV(estimator=rf_classifier, 
#                              param_grid=param_grid, 
#                              cv=skf, 
#                              scoring='accuracy', 
#                              n_jobs=-1) 

#   # Fit the Grid Search CV object to the data
#   grid_search.fit(X, y)

#   # Get the best hyperparameters and score
#   best_params = grid_search.best_params_
#   best_score = grid_search.best_score_

#   return best_params, best_score

# # Example usage
# # Define the hyperparameter grid
# param_grid = {
#     'n_estimators': [100, 200, 300],
#     'max_depth': [None, 5, 10],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4]
# }

# # Find the best hyperparameters
# best_params, best_score = find_best_hyperparameters(train_data, train_label, param_grid)

# print("Best Hyperparameters:", best_params)
# print("Best Cross-Validation Score:",best_score)

In [25]:
# from sklearn.ensemble import RandomForestClassifier
# model5=RandomForestClassifier(n_estimators=300,max_depth=5)
# logging.info('checking our model with RANDOM FOREST')
# model5.fit(train_data,train_label)
# logging.info('model fitted successfully ')
# train_pred = model5.predict(train_data)
# valid_pred=model5.predict(valid_data)
# from sklearn.metrics import confusion_matrix, f1_score
# logging.info(f'This is for train metrix \n{confusion_matrix(train_label,train_pred)}')
# logging.info(f'this is for testing metrix \n{confusion_matrix(valid_label,valid_pred)}')
# logging.info(f"This is f1 score for train data: {f1_score(train_label,train_pred)}")
# logging.info(f"This is f1 score for valid data: {f1_score(valid_label,valid_pred)}")

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score

def find_best_hyperparameters(X, y, model_name, param_grid, n_splits=10, random_state=42):
    """
    Grid-search the hyperparameters of one named classifier.

    The search uses shuffled, stratified k-fold cross-validation and ranks
    candidates by F1 score.

    Args:
      X: Feature matrix.
      y: Target labels.
      model_name: One of 'logistic_regression', 'decision_tree',
                  'random_forest', 'knn', 'svc'.
      param_grid: Mapping of hyperparameter name to the values to try.
      n_splits: Number of cross-validation folds.
      random_state: Seed used for the fold shuffling and for the estimators
                    that are constructed with it here.

    Returns:
      (best_params, best_score): the winning hyperparameter dictionary and
      its mean cross-validated F1 score.

    Raises:
      ValueError: if model_name is not one of the supported names.
    """
    folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Ordered (name, factory) pairs; factories are lazy so only the requested
    # estimator is ever constructed.
    estimator_factories = (
        ('logistic_regression', lambda: LogisticRegression()),
        ('decision_tree', lambda: DecisionTreeClassifier(random_state=random_state, max_features=5)),
        ('random_forest', lambda: RandomForestClassifier(random_state=random_state)),
        ('knn', lambda: KNeighborsClassifier()),
        ('svc', lambda: SVC(random_state=random_state)),
    )

    for known_name, build_estimator in estimator_factories:
        if model_name == known_name:
            estimator = build_estimator()
            break
    else:
        raise ValueError("Invalid model_name. Choose from 'logistic_regression', 'decision_tree', 'random_forest', 'knn', 'svc'.")

    searcher = GridSearchCV(
        estimator=estimator,
        param_grid=param_grid,
        cv=folds,
        scoring='f1',
        n_jobs=-1,
    )
    searcher.fit(X, y)

    return searcher.best_params_, searcher.best_score_

# Example usage
# Define hyperparameter grids for each model
# Grids are keyed by the model names accepted by find_best_hyperparameters.
# Every value must be valid for its estimator (n_estimators and n_neighbors
# must be >= 1), and the grid must stay small enough to search exhaustively:
# GridSearchCV fits every combination once per fold.
param_grids = {
    #'logistic_regression': {'C':[i for i in range(1,3)],'penalty':['l2'],'solver':['lbfgs', 'liblinear']},
    #'decision_tree': {'max_depth': [i for i in range(1,7)], 'min_samples_split': [i for i in range(2,7)], 'min_samples_leaf': [i for i in range(1,7)],'max_leaf_nodes':[i for i in range(2,7)],'criterion':["gini", "entropy", "log_loss"] , 'splitter':["best", "random"]},
    'random_forest': {
        # A few representative forest sizes instead of every integer in 0..999
        # (0 trees is invalid, and 1000 sizes made the grid ~3.6M combinations).
        'n_estimators': [100, 300, 500],
        'max_depth': [2, 4, 6],
        'min_samples_split': [2, 5, 9],
        'min_samples_leaf': [1, 3, 6],
        'max_leaf_nodes': [2, 4, 6],
        # "log_loss" is the same criterion as "entropy", so it is not searched separately.
        'criterion': ["gini", "entropy"],
    },
    'knn': {
        # n_neighbors must be at least 1.
        'n_neighbors': list(range(1, 20)),
        "weights": ['uniform', 'distance'],
        # algorithm and leaf_size tune the neighbour-search speed/memory and
        # should not change the predictions, so they are pinned to the defaults.
        "algorithm": ['auto'],
        "leaf_size": [30],
        'p': [1, 2],
    },
    # 'svc': {'C': [i for i in range(100)], "kernel":['linear','poly', 'rbf', 'sigmoid', 'precomputed'],'degree':[i for i in range(10)],'gamma':['scale','auto'],'probability':[True,False]}
}

# Run the grid search for every configured model and print the winning
# hyperparameters with their cross-validation score.
for model_name in param_grids:
  param_grid = param_grids[model_name]
  search_outcome = find_best_hyperparameters(
      train_data, train_label, model_name, param_grid
  )
  best_params, best_score = search_outcome
  print(f"Best Hyperparameters for {model_name}:", best_params)
  print(f"Best Cross-Validation Score for {model_name}:", best_score)

Best Hyperparameters for logistic_regression: {'C': 1, 'penalty': 'l2', 'solver': 'lbfgs'}
Best Cross-Validation Score for logistic_regression: 0.28391053391053395
Best Hyperparameters for decision_tree: {'criterion': 'gini', 'max_depth': 2, 'max_leaf_nodes': 3, 'min_samples_leaf': 1, 'min_samples_split': 2, 'splitter': 'best'}
Best Cross-Validation Score for decision_tree: 0.5787435500515996