### ***Importing Libraries***

In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

### ***Loading the train and test data***

In [2]:
train_data_path = 'iith_foml_2023_train.csv'
data = pd.read_csv(train_data_path)

test_data_path = 'test_input.csv'
test_data = pd.read_csv(test_data_path)

### ***Separating into target and predictor variable. Only First 20 Features are considered***
### ***Imputing empty values by median***
### ***Using MinMax Scaler for scaling of features***

In [3]:
X = data.drop(columns=['Target Variable (Discrete)','Feature 21 (Discrete)','Feature 22 (Discrete)','Feature 23 (Discrete)','Feature 24'])
y = data['Target Variable (Discrete)']

# Impute missing values using SimpleImputer
imputer = SimpleImputer(strategy='median')
X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

# Standardize the data
scaler = MinMaxScaler(feature_range=(-100, 100))
X_imputed = pd.DataFrame(scaler.fit_transform(X_imputed), columns=X_imputed.columns)

### ***Applying Linear Discriminant Analysis to separate out classes***

In [4]:
lda7 = LinearDiscriminantAnalysis(n_components=7)
lda8 = LinearDiscriminantAnalysis()

In [5]:
lda7.fit(X_imputed, y)
lda8.fit(X_imputed, y)

In [6]:
X_lda7 = lda7.transform(X_imputed)
X_lda8 = lda8.transform(X_imputed)

### ***kNN Model with k=1***

In [7]:
kNN_Model7 = KNeighborsClassifier(n_neighbors=1)
kNN_Model8 = KNeighborsClassifier(n_neighbors=1)

In [8]:
kNN_Model7.fit(X_lda7, y)
kNN_Model8.fit(X_lda8, y)

### ***Predicting on test data***

In [9]:
test_data = test_data.drop(columns=['Feature 21 (Discrete)','Feature 22 (Discrete)','Feature 23 (Discrete)','Feature 24'])
test_data = pd.DataFrame(imputer.transform(test_data), columns=test_data.columns)
test_data = pd.DataFrame(scaler.transform(test_data), columns=test_data.columns)
test_data7 = lda7.transform(test_data)
test_data8 = lda8.transform(test_data)

predictions7 = kNN_Model7.predict(test_data7)
predictions8 = kNN_Model8.predict(test_data8)

results_df7 = pd.DataFrame({
    'ID': np.arange(1, len(predictions7) + 1),  # Assuming IDs start from 1
    'Category': predictions7
})

results_df8 = pd.DataFrame({
    'ID': np.arange(1, len(predictions8) + 1),  # Assuming IDs start from 1
    'Category': predictions8
})

### ***Ensembling two trained models.***
### ***If any of the models are predicting minority classes then it is considered to be in minority class because most of the minority classes is going into majority class as I have seen while trying different models.***

In [10]:
results_df8.rename(columns = {'Category':'curr'}, inplace = True)
results_df7.rename(columns = {'Category':'prev'}, inplace = True)

merged_df = pd.concat([results_df7,results_df8['curr']], axis=1)

differing_predictions = merged_df[merged_df['curr'] - merged_df['prev'] != 0]

In [11]:
target_ids = [0, 1, 2, 5, 6]

vara = differing_predictions[differing_predictions['prev'].isin(target_ids)]
for id in vara['ID']:
    results_df7['prev'][id-1] = results_df8['curr'][id-1]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results_df7['prev'][id-1] = results_df8['curr'][id-1]


### ***Saving the prediction file***

In [12]:
results_df7.rename(columns = {'prev':'Category'}, inplace = True)
results_df7.to_csv('test_output.csv', index=False)