In [1]:
# Import necessary libraries
import numpy as np # For numerical operations
import pandas as pd # For data manipulation and analysis
import matplotlib.pyplot as plt # For creating visualizations
import seaborn as sns # For enhanced visualizations

In [3]:
# Load the training data from a CSV file into a Pandas DataFrame
df = pd.read_csv('train.csv')

In [4]:
# Display the first few rows of the DataFrame to inspect the data
df.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Import the train_test_split function from scikit-learn
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
# X_train: Features for training
# X_test: Features for testing
# y_train: Target variable for training
# y_test: Target variable for testing
# df.iloc[:,1:]: Selects all rows and all columns starting from the second column (features)
# df.iloc[:,0]: Selects all rows and the first column (target variable)
# test_size=0.2: Specifies that 20% of the data should be used for testing
# random_state=2: Sets a seed for the random number generator to ensure reproducibility
X_train,X_test,y_train,y_test = train_test_split(df.iloc[:,1:],df.iloc[:,0],test_size=0.2,random_state=2)

In [6]:
# Import the LogisticRegression and DecisionTreeClassifier classes from scikit-learn
from sklearn.linear_model import LogisticRegression # For logistic regression model
from sklearn.tree import DecisionTreeClassifier # For decision tree model

In [7]:
# Create instances of the Logistic Regression and Decision Tree classifiers
clf1 = LogisticRegression() # Logistic Regression model
clf2 = DecisionTreeClassifier() # Decision Tree model

In [8]:
# Train the Logistic Regression and Decision Tree classifiers using the training data
clf1.fit(X_train,y_train) # Train Logistic Regression model
clf2.fit(X_train,y_train) # Train Decision Tree model

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [9]:
# Make predictions on the test set using the trained classifiers
y_pred1 = clf1.predict(X_test) # Predictions from Logistic Regression
y_pred2 = clf2.predict(X_test) # Predictions from Decision Tree

In [10]:
# Import the accuracy_score and confusion_matrix functions from scikit-learn
from sklearn.metrics import accuracy_score,confusion_matrix

# Evaluate the accuracy of the Logistic Regression and Decision Tree classifiers
print("Accuracy of Logistic Regression",accuracy_score(y_test,y_pred1)) # Print Logistic Regression accuracy
print("Accuracy of Decision Trees",accuracy_score(y_test,y_pred2)) # Print Decision Tree accuracy

Accuracy of Logistic Regression 0.9145238095238095
Accuracy of Decision Trees 0.8541666666666666


In [11]:
# Print the confusion matrix for Logistic Regression
print("Logistic Regression Confusion Matrix\n")
# Create a Pandas DataFrame from the confusion matrix for better readability
# Columns are labeled from 0 to 9 (representing the digits)
pd.DataFrame(confusion_matrix(y_test,y_pred1),columns=list(range(0,10)))

Logistic Regression Confusion Matrix



Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,791,0,1,0,1,9,11,1,6,1
1,0,938,2,4,0,3,0,3,12,0
2,2,9,745,16,9,4,13,13,15,3
3,1,4,19,763,0,36,5,5,20,11
4,2,5,4,0,804,1,7,3,9,21
5,9,3,4,23,10,610,15,10,34,11
6,12,4,12,1,7,8,789,0,5,1
7,3,2,13,6,4,2,0,812,2,29
8,11,12,13,18,1,21,6,6,694,11
9,3,6,2,14,25,4,0,37,7,736


In [12]:
# Print the confusion matrix for Decision Tree
print("Decision Tree Confusion Matrix\n")
# Create a Pandas DataFrame from the confusion matrix for better readability
# Columns are labeled from 0 to 9 (representing the digits)
pd.DataFrame(confusion_matrix(y_test,y_pred2),columns=list(range(0,10)))

Decision Tree Confusion Matrix



Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,758,0,9,5,8,16,7,3,11,4
1,1,905,8,11,4,5,4,10,13,1
2,12,14,665,34,16,16,19,23,21,9
3,8,11,29,699,7,45,8,14,26,17
4,10,1,10,9,728,14,10,10,17,47
5,11,9,13,41,7,581,18,8,23,18
6,9,1,10,5,11,25,759,0,13,6
7,2,9,27,11,10,3,1,792,6,12
8,11,11,23,39,16,41,10,7,607,28
9,5,6,9,15,43,21,4,27,23,681


In [13]:
# Import precision_score, recall_score, and f1_score from sklearn.metrics
from sklearn.metrics import precision_score,recall_score,f1_score

In [14]:
# Calculate the weighted precision score for Logistic Regression
precision_score(y_test,y_pred1,average='weighted')

0.9142150713630827

In [15]:
# Calculate the weighted recall score for Logistic Regression
recall_score(y_test,y_pred1,average='weighted')

0.9145238095238095

In [16]:
# Calculate the weighted F1 score for Logistic Regression
f1_score(y_test,y_pred1,average='weighted')

0.9142794994052751

In [None]:
# Import the classification_report function from scikit-learn
from sklearn.metrics import classification_report

# Print a comprehensive classification report for Logistic Regression
# Includes precision, recall, F1-score, and support for each class
print(classification_report(y_test,y_pred1))

              precision    recall  f1-score   support

           0       0.95      0.96      0.96       821
           1       0.95      0.98      0.96       962
           2       0.91      0.90      0.91       829
           3       0.90      0.88      0.89       864
           4       0.93      0.94      0.94       856
           5       0.87      0.84      0.85       729
           6       0.93      0.94      0.94       839
           7       0.91      0.93      0.92       873
           8       0.86      0.88      0.87       793
           9       0.89      0.88      0.89       834

    accuracy                           0.91      8400
   macro avg       0.91      0.91      0.91      8400
weighted avg       0.91      0.91      0.91      8400

