In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE,RandomOverSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report, confusion_matrix
from itertools import combinations
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis


In [14]:
# CSV file
file_path = 'data/iith_foml_2023_train.csv'
data = pd.read_csv(file_path)

In [15]:
# Assuming the target variable column is named "Target Variable (Discrete)"
X = data.drop(columns=['Target Variable (Discrete)','Feature 21 (Discrete)','Feature 22 (Discrete)','Feature 23 (Discrete)','Feature 24'])
y = data['Target Variable (Discrete)']

# Impute missing values using SimpleImputer
imputer = SimpleImputer(strategy='median')
X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

# Standardize the data
scaler = MinMaxScaler(feature_range=(-100, 100))
X_imputed = pd.DataFrame(scaler.fit_transform(X_imputed), columns=X_imputed.columns)

In [16]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda7 = LinearDiscriminantAnalysis(n_components=7)
lda8 = LinearDiscriminantAnalysis(n_components=8)

In [17]:
lda7.fit(X_imputed, y)
lda8.fit(X_imputed, y)

In [18]:
X_lda7 = lda7.transform(X_imputed)
X_lda8 = lda8.transform(X_imputed)

In [19]:
kNN_Model7 = KNeighborsClassifier(n_neighbors=1)
kNN_Model8 = KNeighborsClassifier(n_neighbors=1)

In [20]:
kNN_Model7.fit(X_lda7, y)
kNN_Model8.fit(X_lda8, y)

In [21]:
# Load test data
test_data = pd.read_csv('data/iith_foml_2023_test.csv')
test_data = test_data.drop(columns=['Feature 21 (Discrete)','Feature 22 (Discrete)','Feature 23 (Discrete)','Feature 24'])
test_data = pd.DataFrame(imputer.transform(test_data), columns=test_data.columns)
test_data = pd.DataFrame(scaler.transform(test_data), columns=test_data.columns)
test_data7 = lda7.transform(test_data)
test_data8 = lda8.transform(test_data)

# Make predictions on the test data
predictions7 = kNN_Model7.predict(test_data7)
predictions8 = kNN_Model8.predict(test_data8)

# Create a DataFrame for results with ID and Predicted Category columns
results_df7 = pd.DataFrame({
    'ID': np.arange(1, len(predictions7) + 1),  # Assuming IDs start from 1
    'Category': predictions7
})

# Create a DataFrame for results with ID and Predicted Category columns
results_df8 = pd.DataFrame({
    'ID': np.arange(1, len(predictions8) + 1),  # Assuming IDs start from 1
    'Category': predictions8
})

# # Write the results to a CSV file
# results_df8.to_csv('output_results.csv', index=False)



In [22]:
results_df8.rename(columns = {'Category':'curr'}, inplace = True)
results_df7.rename(columns = {'Category':'prev'}, inplace = True)

# Merge the two DataFrames on the index (assuming the rows are in the same order)
merged_df = pd.concat([results_df7,results_df8['curr']], axis=1)

# Display rows where the predictions differ
differing_predictions = merged_df[merged_df['curr'] - merged_df['prev'] != 0]

In [23]:
# List of IDs to filter
target_ids = [0, 1, 2, 5, 6]

vara = differing_predictions[differing_predictions['prev'].isin(target_ids)]
for id in vara['ID']:
    results_df7['prev'][id-1] = results_df8['curr'][id-1]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results_df7['prev'][id-1] = results_df8['curr'][id-1]


In [24]:
results_df7.rename(columns = {'prev':'Category'}, inplace = True)
results_df7.to_csv('output_results.csv', index=False)