In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Load the raw survey responses from the CSV export.
df = pd.read_csv('DS project2.csv')

# Data cleaning: shorten the survey's verbose column headers to the names used below.
df = df.rename(columns={
    'Category Rank': 'Rank',
    'Enter Your Name': 'Name',
    'Preferred  Branch at IIITDM': 'Branch',
    'Select your state name': 'State',
    'Did you got the seat?': 'Output',
})
print(df.shape)

# Fill missing ranks with the median rank of the matching (Branch, Gender, Category) subgroup.
df['Rank'] = df.groupby(['Branch', 'Gender', 'Category'])['Rank'].transform(lambda x: x.fillna(x.median()))
# Ranks that are still missing (e.g. a subgroup with no known rank at all) fall back to the
# overall median. The result is assigned back on purpose: fillna(..., inplace=True) on the
# df['Rank'] selection is chained assignment, which pandas deprecates and which leaves df
# unchanged once Copy-on-Write is active.
df['Rank'] = df['Rank'].fillna(df['Rank'].median())

# Keep the first row per Name, then the first row per Rank.
# NOTE(review): the Rank de-duplication runs after imputation, so rows that received the same
# median rank collapse into a single row -- confirm this is intended.
df = df.drop_duplicates(subset=['Name'], keep='first')
df = df.drop_duplicates(subset=['Rank'], keep='first')

# The submission timestamp is not used by anything below.
df = df.drop(columns=['Timestamp'])
print(df.shape)
# Data Transformation: integer-encode each categorical column.
# One fitted encoder is kept per column so the code <-> label mapping stays available
# (label_encoders[col].classes_, label_encoders[col].inverse_transform). Refitting a single
# shared encoder would overwrite classes_ on every fit and keep only the last column's mapping.
label_encoders = {}
for column in ['Category', 'Gender', 'Branch', 'State', 'Output']:
    label_encoders[column] = LabelEncoder()
    df[column] = label_encoders[column].fit_transform(df[column])

# Keep the `label_encoder` name bound to the encoder fitted on 'Output' for any code that
# still refers to it.
label_encoder = label_encoders['Output']

# Drop rows whose Rank falls outside the 1.5 * IQR fences around the quartiles.
Q1, Q3 = df['Rank'].quantile([0.25, 0.75])
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
# between() includes both end points by default, matching a >= / <= comparison pair.
rank_within_fences = df['Rank'].between(lower_bound, upper_bound)
df = df[rank_within_fences]


# Separate the model inputs (X) from the label to predict (y).
feature_columns = ['Branch', 'Rank', 'Gender', 'State', 'Category']
X = df.loc[:, feature_columns]
y = df.loc[:, 'Output']

# Manual PCA: eigen-decomposition of the feature covariance matrix.

# Center every feature on its own column mean (`mean` holds one value per column of X).
mean = X.mean(axis=0)
X_centered = X.sub(mean)

# Covariance between features: rowvar=False treats columns as variables, rows as observations.
cov_matrix = np.cov(X_centered, rowvar=False)

# Eigen-decomposition of the covariance matrix.
eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

# Reorder so the largest eigenvalue, and its eigenvector column, comes first.
sorted_indices = np.argsort(eigenvalues)[::-1]
eigenvalues, eigenvectors = eigenvalues[sorted_indices], eigenvectors[:, sorted_indices]

# Keep the leading components. X has five columns, so n_components = 5 keeps every
# component: the projection re-expresses the data in a new basis without dropping a dimension.
n_components = 5
top_eigenvectors = eigenvectors[:, :n_components]

# Express the centered observations in the principal-component basis.
X_pca = np.dot(X_centered, top_eigenvectors)

# Manual min-max normalization: rescale each principal component to the [0, 1] interval.
# min_vals / max_vals are kept so the same scaling can be applied to new rows later.
min_vals = X_pca.min(axis=0)
max_vals = X_pca.max(axis=0)
component_span = max_vals - min_vals
X_normalized = (X_pca - min_vals) / component_span

# Train the Random Forest classifier on the normalized principal components.
# A fixed random_state pins the forest's internal randomness, so a fresh
# Restart & Run All yields the same fitted model and the same predictions.
RANDOM_STATE = 42
clf = RandomForestClassifier(random_state=RANDOM_STATE)
clf.fit(X_normalized, y)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Pairwise correlations of every column except the first one.
correlation_matrix = df.iloc[:, 1:].corr()

# Draw the annotated heatmap on an explicitly created axes.
fig, ax = plt.subplots()
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", ax=ax)
ax.set_title("Correlation Heatmap for Multiple Variables")
plt.show()

In [None]:
# User input for prediction
def _read_int(prompt):
    """Prompt until the user types a whole number, then return it as an int.

    Re-asking keeps a typo or an empty entry from aborting the cell with ValueError.
    """
    while True:
        raw_value = input(prompt)
        try:
            return int(raw_value)
        except ValueError:
            print(f'"{raw_value}" is not a whole number, please try again.')


# The numeric codes offered in the menus are presumably the label-encoded values used during
# training -- TODO confirm against the fitted encoders.
# NOTE(review): the state menu has no entry for code 2; verify which state it stands for.
name = input('Enter your Name: ')  # collected for the dialogue only; not a model feature
branch = _read_int('Enter your Preferred Branch number:\n0 for CSE\n1 for CSEAI\n2 for ECE\n3 for ME\n')
rank = _read_int('Enter your Category Rank: ')
gender = _read_int('Enter Gender number:\n1 for Male\n0 for Female\n')
location = _read_int('Enter number of your state:\n0 for Andhra Pradesh\n1 for Bihar\n3 for Gujarat\n4 for Haryana\n5 for Himachal Pradesh\n6 for Karnataka\n7 for Kerala\n8 for Madhya Pradesh\n9 for Maharashtra\n10 for Odisha\n11 for Punjab\n12 for Rajasthan\n13 for Tamil Nadu\n14 for Telangana\n15 for Uttar Pradesh\n16 for West Bengal\n')
category = _read_int('Enter category number:\n0 for EWS\n1 for General\n2 for OBC\n3 for PwD\n4 for SC\n5 for ST\n')

# Assemble a one-row frame from the user's answers ('Name' is intentionally left out).
feature_names = ['Branch', 'Rank', 'Gender', 'State', 'Category']
feature_values = [branch, rank, gender, location, category]
user_data = {column: [value] for column, value in zip(feature_names, feature_values)}

user_data_df = pd.DataFrame(user_data)

# Apply the stored training transform to the user's row: center on the stored mean,
# project onto the stored eigenvectors, then min-max scale with the stored bounds.
user_data_centered = user_data_df.sub(mean)
user_data_pca = np.dot(user_data_centered, top_eigenvectors)

component_range = max_vals - min_vals
user_data_normalized = (user_data_pca - min_vals) / component_range

# Ask the fitted forest for the class of the single user row.
user_prediction = clf.predict(user_data_normalized)

# Class 1 is reported as an admission, anything else as a rejection.
admitted = user_prediction[0] == 1
print("Congratulations! You can join the college." if admitted else "Sorry, you cannot join the college.")