## Importing all the necessary libraies 

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib
from xgboost import XGBClassifier

## Loading, cleaning and preparing the dataset

In [7]:
data = pd.read_excel("train.xlsx")

In [8]:
data.head()


Unnamed: 0,T1,T2,T3,T4,T5,T6,T7,T8,T9,T10,T11,T12,T13,T14,T15,T16,T17,T18,target
0,-70,-61,-66,-53,-51,-63,-82,-57,-76,-78,-66,-66,-61,-59,-73,-75,-63,-77,B37
1,-77,-74,-71,-76,-65,-63,-66,-52,-55,-75,-72,-75,-74,-61,-64,-63,-53,-63,B61
2,-53,-38,-55,-66,-62,-62,-65,-70,-62,-52,-56,-53,-66,-68,-72,-60,-68,-77,A19
3,-72,-62,-59,-65,-65,-65,-78,-82,-83,-59,-84,-60,-64,-83,-69,-72,-95,-73,A22
4,-67,-69,-65,-63,-59,-53,-70,-72,-71,-60,-61,-57,-54,-76,-61,-66,-71,-80,A33


In [9]:
data = data.dropna()

In [10]:
data["target"].value_counts()

target
A39    635
A33    633
A29    432
A3     429
B45    428
      ... 
A27    191
B65    191
A69    190
A18    189
A9     188
Name: count, Length: 160, dtype: int64

In [11]:
# Converting the target columns in to a categorical form
data['target'] = data['target'].astype('category').cat.codes

In [12]:
data['target'].value_counts()

target
32     635
26     633
21     432
22     429
119    428
      ... 
19     191
141    191
65     190
9      189
79     188
Name: count, Length: 160, dtype: int64

In [13]:
X= data.drop('target',axis=1)
y = data['target']

## Splitting the dataset

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

In [15]:
X_train

Unnamed: 0,T1,T2,T3,T4,T5,T6,T7,T8,T9,T10,T11,T12,T13,T14,T15,T16,T17,T18
23860,-50,-52,-65,-54,-58,-57,-63,-65,-55,-46,-56,-61,-68,-56,-67,-59,-80,-66
22160,-60,-56,-44,-66,-75,-58,-74,-68,-69,-55,-53,-44,-57,-72,-66,-72,-74,-70
30056,-72,-70,-68,-68,-72,-65,-64,-69,-65,-63,-63,-63,-62,-57,-49,-54,-60,-58
32358,-75,-69,-76,-75,-61,-75,-63,-58,-51,-85,-76,-82,-74,-64,-66,-59,-66,-64
10026,-65,-64,-55,-46,-54,-61,-69,-55,-66,-60,-73,-62,-62,-73,-56,-55,-66,-59
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36067,-73,-82,-67,-70,-70,-75,-66,-62,-62,-95,-71,-95,-68,-64,-67,-62,-69,-45
24931,-76,-62,-63,-65,-49,-59,-67,-65,-71,-72,-62,-60,-62,-61,-59,-60,-64,-66
31521,-64,-61,-56,-47,-53,-59,-71,-55,-66,-59,-71,-62,-59,-77,-57,-55,-66,-58
30100,-63,-69,-60,-54,-64,-53,-79,-68,-59,-57,-71,-56,-57,-72,-61,-67,-61,-62


In [16]:
X_test

Unnamed: 0,T1,T2,T3,T4,T5,T6,T7,T8,T9,T10,T11,T12,T13,T14,T15,T16,T17,T18
22046,-55,-47,-55,-65,-66,-65,-63,-73,-65,-44,-53,-61,-59,-64,-62,-73,-65,-66
1677,-56,-56,-76,-78,-71,-75,-80,-79,-81,-50,-55,-58,-67,-75,-80,-82,-72,-81
13752,-74,-81,-74,-71,-64,-62,-63,-52,-51,-75,-76,-74,-76,-61,-65,-62,-52,-60
16924,-65,-69,-64,-67,-66,-54,-61,-51,-45,-77,-64,-62,-67,-61,-58,-54,-48,-51
5974,-82,-79,-68,-74,-73,-72,-57,-57,-54,-84,-81,-80,-68,-64,-64,-72,-59,-53
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31609,-63,-69,-58,-56,-63,-53,-73,-69,-63,-59,-69,-56,-57,-73,-61,-65,-69,-63
17079,-63,-79,-61,-59,-68,-56,-65,-70,-74,-63,-56,-57,-58,-75,-64,-62,-68,-76
20199,-75,-71,-67,-70,-77,-60,-70,-67,-54,-74,-68,-80,-68,-62,-61,-62,-69,-69
24550,-63,-62,-55,-45,-55,-61,-73,-55,-65,-59,-72,-61,-59,-72,-56,-55,-65,-58


## Scalling the dataset

In [17]:
scaller = StandardScaler()
X_train = scaller.fit_transform(X_train)
X_test = scaller.transform(X_test)

## Training a model using Random Forest Classifier

In [59]:
model = RandomForestClassifier()

In [60]:
model.fit(X_train,y_train)

In [61]:
y_pred = model.predict(X_test)

In [62]:
accuracy_score(y_test,y_pred)

0.9873486600462522

In [65]:
joblib.dump(model, "task1rf.pkl")

['task1rf.pkl']

## Training a model using XG Boost Classifier

In [67]:
xgb_model = XGBClassifier()
xgb_model.fit(X_train,y_train)

In [68]:
y_pred = xgb_model.predict(X_test)

In [70]:
accuracy_score(y_test, y_pred)

0.9798666848047884

In [72]:
joblib.dump(xgb_model, "task1XGB.pkl")

['task1XGB.pkl']

## Testing the model using Test file

In [14]:
test = pd.read_excel("test.xlsx")

In [15]:
test.head()

Unnamed: 0,T1,T2,T3,T4,T5,T6,T7,T8,T9,T10,T11,T12,T13,T14,T15,T16,T17,T18
0,-76,-83,-70,-66,-64,-72,-64,-69,-60,-76,-83,-78,-81,-81,-81,-70,-60,-60
1,-58,-57,-78,-81,-73,-73,-78,-78,-82,-49,-55,-58,-66,-79,-72,-83,-74,-80
2,-70,-70,-71,-69,-69,-68,-61,-55,-53,-82,-87,-76,-68,-57,-64,-75,-57,-70
3,-71,-61,-56,-56,-61,-60,-68,-66,-72,-58,-55,-56,-58,-62,-61,-59,-64,-65
4,-72,-71,-64,-69,-64,-63,-61,-42,-55,-61,-69,-67,-63,-63,-55,-49,-49,-57


In [16]:
test = scaller.transform(test)

## Testing with Random Forest

In [17]:
modelrf = joblib.load("task1rf.pkl")

In [41]:
rftarget = modelrf.predict(test)

In [42]:
target_labels = data['target'].astype('category').cat.categories
rftarget_decoded = [target_labels[idx] for idx in rftarget]

In [43]:
rftarget_df = pd.DataFrame(rftarget_decoded, columns=['Predicted_Target'])

In [44]:
print(rftarget_df.head())


  Predicted_Target
0              B74
1              A10
2              B65
3              B20
4              A67


In [45]:
rftarget_df.to_csv("rftarget.csv")

## Testing with XG BOOST

In [46]:
modelXGB = joblib.load("task1XGB.pkl")

In [47]:
XGBtarget = modelXGB.predict(test)

In [48]:
target_labels = data['target'].astype('category').cat.categories

In [49]:
XGBtarget_decoded = [target_labels[idx] for idx in XGBtarget]

In [50]:
XGBtarget_df = pd.DataFrame(XGBtarget_decoded, columns=['Predicted_Target'])

In [51]:
print(XGBtarget_df.head())
XGBtarget_df.to_csv("XGBtarget.csv")

  Predicted_Target
0              B74
1              A10
2              B65
3              B20
4              A67


In [18]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.9759216433138349


In [None]:
new_data_scaled = scaller.transform(new_data)  # Scale the new data
predictions = model.predict(new_data_scaled)

In [19]:
from sklearn.svm import SVC
svm_model = SVC(kernel='rbf')
svm_model.fit(X_train, y_train)
y_pred = svm_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)



Accuracy: 0.9874846959597333
