In [22]:
import os

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, confusion_matrix, silhouette_score

# Location of the raw survey export. The default is the original local path;
# set the RESEARCH_INFO_CSV environment variable to run the notebook on
# another machine without editing this cell.
DATA_PATH = os.environ.get(
    "RESEARCH_INFO_CSV",
    "C:/Users/admin/Downloads/FirstStepFiles/ResearchInformation3.csv",
)
df = pd.read_csv(DATA_PATH)

# 1) Simple mapping -> Track (supervised setup)
# Each university department is assigned to one of three tracks.
dept_to_track = {
    "Business Administration": "ABM",
    "Computer Science and Engineering": "STEM",
    "Economics": "ABM",
    "Electrical and Electronic Engineering": "STEM",
    "English": "HUMSS",
    "Journalism, Communication and Media Studies": "HUMSS",
    "Law and Human Rights": "HUMSS",
    "Political Science": "HUMSS",
    "Public Health": "STEM",
    "Sociology": "HUMSS"
}
df['Track'] = df['Department'].map(dept_to_track)

# Series.map leaves NaN for any department that is not a key of the dict.
# Surface those explicitly so a new or misspelled department name does not
# silently lose its label.
unmapped_departments = df.loc[df['Track'].isna(), 'Department'].unique()
if len(unmapped_departments) > 0:
    print("Warning: departments without a Track mapping:", unmapped_departments)

print(df['Track'])



0        ABM
1        ABM
2        ABM
3        ABM
4        ABM
       ...  
488     STEM
489     STEM
490    HUMSS
491    HUMSS
492    HUMSS
Name: Track, Length: 493, dtype: object


In [23]:
# 2) Feature engineering
# Numeric columns used as model inputs, defined once and reused below.
num_cols = ['HSC','SSC','Overall','Last','English','Computer']
feature_cols = num_cols + ['Extra_flag','Attendance_flag']

# Coerce to numeric; entries that cannot be parsed become NaN and are
# imputed with the column median further down.
for c in num_cols:
    df[c] = pd.to_numeric(df[c], errors='coerce')

# Flags
# Extra: 'yes' -> 1, 'no' -> 0, anything else (including missing) -> 0.
# Whitespace is stripped so values like ' Yes' are still recognised, and the
# result is cast to int so the dtype does not depend on whether NaNs occurred.
df['Extra_flag'] = (
    df['Extra'].astype(str).str.strip().str.lower()
    .map({'yes': 1, 'no': 0})
    .fillna(0)
    .astype(int)
)
# Attendance flag example: 1 when the attendance bucket is '80%-100%'.
# regex=False makes this a literal substring match.
df['Attendance_flag'] = (
    df['Attendance'].str.contains('80%-100%', regex=False, na=False).astype(int)
)

# fill numeric missing values with median
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

# scaled features (all rows; used for the unsupervised part only)
scaler = StandardScaler()
X = scaler.fit_transform(df[feature_cols])

# 3a) Unsupervised: clustering
pca = PCA(n_components=5)
X_pca = pca.fit_transform(X)
k = 4  # try different k and evaluate silhouette
# n_init is set explicitly: its default differs between scikit-learn
# releases, which would otherwise make the clustering version-dependent.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
labels = kmeans.fit_predict(X_pca)
df['cluster'] = labels
print("Silhouette:", silhouette_score(X_pca, labels))

# Inspect cluster composition by Department:
print(df.groupby('cluster')['Department'].value_counts().head(20))

# 3b) Supervised: predict Track if Track exists
train_df = df.dropna(subset=['Track']).copy()
y = train_df['Track']

# Split BEFORE scaling so the held-out rows do not influence the scaler's
# mean/variance (avoids train/test leakage). A dedicated scaler is used so
# the one fitted for clustering above is not overwritten.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    train_df[feature_cols], y, test_size=0.2, random_state=42, stratify=y
)
clf_scaler = StandardScaler()
X_train = clf_scaler.fit_transform(X_train_raw)
X_test = clf_scaler.transform(X_test_raw)
# Scaled view of every labelled row, using the training-split statistics.
X_super = clf_scaler.transform(train_df[feature_cols])

clf = RandomForestClassifier(n_estimators=200, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# zero_division=0 reports 0 for classes that were never predicted instead of
# emitting an UndefinedMetricWarning for each affected metric.
print(classification_report(y_test, y_pred, zero_division=0))
print(confusion_matrix(y_test, y_pred))

Silhouette: 0.23048564541020217
cluster  Department                                 
0        Computer Science and Engineering                49
         Journalism, Communication and Media Studies      8
         English                                          2
         Political Science                                2
         Public Health                                    2
         Business Administration                          1
         Electrical and Electronic Engineering            1
         Law and Human Rights                             1
         Sociology                                        1
1        Computer Science and Engineering               159
         English                                         11
         Business Administration                          4
         Economics                                        4
         Political Science                                4
         Journalism, Communication and Media Studies      3
         Sociol

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [24]:
# Keep only the rows whose mapped Track is STEM, then show the labels and
# how many rows were selected.
stem_df = df.loc[df['Track'].eq('STEM')]
print(stem_df['Track'])
print("STEM count:", len(stem_df))

5      STEM
6      STEM
7      STEM
8      STEM
9      STEM
       ... 
446    STEM
447    STEM
452    STEM
488    STEM
489    STEM
Name: Track, Length: 446, dtype: object
STEM count: 446


In [25]:
# Plain-text preview of the first five rows of the working frame.
print(df.head(n=5))

                Department  Gender   HSC   SSC                        Income  \
0  Business Administration    Male  4.17  4.84            Low (Below 15,000)   
1  Business Administration  Female  4.92  5.00  Upper middle (30,000-50,000)   
2  Business Administration    Male  5.00  4.83  Lower middle (15,000-30,000)   
3  Business Administration    Male  4.00  4.50           High (Above 50,000)   
4  Business Administration  Female  2.19  3.17  Lower middle (15,000-30,000)   

  Hometown  Computer        Preparation             Gaming Attendance Job  \
0  Village         3  More than 3 Hours           0-1 Hour   80%-100%  No   
1     City         3           0-1 Hour           0-1 Hour   80%-100%  No   
2  Village         3           0-1 Hour  More than 3 Hours   80%-100%  No   
3     City         5  More than 3 Hours  More than 3 Hours   80%-100%  No   
4  Village         3           0-1 Hour          2-3 Hours   80%-100%  No   

   English Extra Semester   Last  Overall Track  Extra_f

In [26]:

# Departments treated as indicating an interest in IT.
it_list = ["Computer Science", "Information Technology", "Computer Science and Engineering"]
# Boolean flag: True when the (whitespace-stripped) department is in it_list.
# Missing departments are replaced by '' first, so they evaluate to False.
df['Interest_in_IT'] = df['Department'].fillna('').str.strip().isin(it_list)

# Raw survey columns hidden from the preview so the derived columns are visible.
ex_cls = ['Attendance', 'Income', 'Hometown', 'Computer', 'English', 'Overall', 'Last', 'SSC', 'HSC', 'Extra', 'Gaming', 'Preparation', 'Gender', 'Job', 'Semester']
print(df.drop(columns=ex_cls))

# Number of rows flagged as interested in IT. .sum() counts the True values;
# .count() would only count non-null entries, which for this flag is always
# the total number of rows.
print(df['Interest_in_IT'].sum())

                  Department  Track  Extra_flag  Attendance_flag  cluster  \
0    Business Administration    ABM           1                1        1   
1    Business Administration    ABM           1                1        1   
2    Business Administration    ABM           1                1        1   
3    Business Administration    ABM           1                1        1   
4    Business Administration    ABM           1                1        0   
..                       ...    ...         ...              ...      ...   
488            Public Health   STEM           1                0        0   
489            Public Health   STEM           1                0        0   
490                Sociology  HUMSS           0                0        0   
491                Sociology  HUMSS           1                1        1   
492                Sociology  HUMSS           1                1        1   

     Interest_in_IT  
0             False  
1             False  
2        

In [27]:
# Distinct department labels present in the data.
print(df.loc[:, 'Department'].unique())

['Business Administration' 'Computer Science and Engineering' 'Economics'
 'Electrical and Electronic Engineering' 'English'
 'Journalism, Communication and Media Studies' 'Law and Human Rights'
 'Political Science' 'Public Health' 'Sociology']


In [28]:
# Distinct Track labels produced by the department mapping.
print(df.loc[:, 'Track'].unique())

['ABM' 'STEM' 'HUMSS']


In [33]:
# Write the working frame to a CSV in the current directory; the row index
# is not written as a column.
df.to_csv(path_or_buf='processed_interest_data.csv', index=False)
print("✅ Processed data saved!")


✅ Processed data saved!


In [None]:
from sklearn.ensemble import RandomForestClassifier
# Fit a random forest that predicts Track from the single boolean
# Interest_in_IT column. This rebinds the notebook-level names X and y.
# NOTE(review): Interest_in_IT and Track are both derived from Department,
# so the feature encodes part of the label — confirm this is intended. There
# is also no held-out split here, so the fit gives no generalisation estimate.
X, y = df[['Interest_in_IT']], df['Track']

rf = RandomForestClassifier(random_state=42)
rf.fit(X=X, y=y)



['STEM']


