# **Campus Placement Data Analysis and Machine Learning Project.**


This project analyzes campus placement data to uncover trends and factors that influence student placement outcomes. A machine learning model then predicts whether a student will be placed, based on gender and academic scores (`ssc_p`, `hsc_p`, `degree_p`, `etest_p`, `mba_p`), helping institutions and students make data-driven decisions for improved placement strategies.

In [2]:
# loading useful libraries 
import pandas as pd 
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt 
import warnings

# Ignore all warnings
# NOTE(review): with no category or module argument this filter hides every
# warning for the rest of the session, including deprecation and
# input-validation warnings from libraries — consider narrowing it.
warnings.filterwarnings("ignore")


In [3]:
# Load the placement dataset from train.csv (path is relative to the working
# directory) into df, then display the frame.
df = pd.read_csv('train.csv')
df

Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
0,1,0,67.00,Others,91.00,Others,Commerce,58.00,Sci&Tech,No,55.0,Mkt&HR,58.80,Placed,270000.0
1,2,0,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed,200000.0
2,3,0,65.00,Central,68.00,Central,Arts,64.00,Comm&Mgmt,No,75.0,Mkt&Fin,57.80,Placed,250000.0
3,4,0,56.00,Central,52.00,Central,Science,52.00,Sci&Tech,No,66.0,Mkt&HR,59.43,Not Placed,
4,5,0,85.80,Central,73.60,Central,Commerce,73.30,Comm&Mgmt,No,96.8,Mkt&Fin,55.50,Placed,425000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,211,0,80.60,Others,82.00,Others,Commerce,77.60,Comm&Mgmt,No,91.0,Mkt&Fin,74.49,Placed,400000.0
211,212,0,58.00,Others,60.00,Others,Science,72.00,Sci&Tech,No,74.0,Mkt&Fin,53.62,Placed,275000.0
212,213,0,67.00,Others,67.00,Others,Commerce,73.00,Comm&Mgmt,Yes,59.0,Mkt&Fin,69.72,Placed,295000.0
213,214,1,74.00,Others,66.00,Others,Commerce,58.00,Comm&Mgmt,No,70.0,Mkt&HR,60.23,Placed,204000.0


In [4]:
# Start exploring the dataset: preview the first 10 rows
df.head(10)

Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
0,1,0,67.0,Others,91.0,Others,Commerce,58.0,Sci&Tech,No,55.0,Mkt&HR,58.8,Placed,270000.0
1,2,0,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed,200000.0
2,3,0,65.0,Central,68.0,Central,Arts,64.0,Comm&Mgmt,No,75.0,Mkt&Fin,57.8,Placed,250000.0
3,4,0,56.0,Central,52.0,Central,Science,52.0,Sci&Tech,No,66.0,Mkt&HR,59.43,Not Placed,
4,5,0,85.8,Central,73.6,Central,Commerce,73.3,Comm&Mgmt,No,96.8,Mkt&Fin,55.5,Placed,425000.0
5,6,0,55.0,Others,49.8,Others,Science,67.25,Sci&Tech,Yes,55.0,Mkt&Fin,51.58,Not Placed,
6,7,1,46.0,Others,49.2,Others,Commerce,79.0,Comm&Mgmt,No,74.28,Mkt&Fin,53.29,Not Placed,
7,8,0,82.0,Central,64.0,Central,Science,66.0,Sci&Tech,Yes,67.0,Mkt&Fin,62.14,Placed,252000.0
8,9,0,73.0,Central,79.0,Central,Commerce,72.0,Comm&Mgmt,No,91.34,Mkt&Fin,61.29,Placed,231000.0
9,10,0,58.0,Central,70.0,Central,Commerce,61.0,Comm&Mgmt,No,54.0,Mkt&Fin,52.21,Not Placed,


In [5]:
# True if at least one row is an exact duplicate (all columns equal) of an earlier row
df.duplicated().any()


False

In [6]:
# Summary statistics for every column (numeric and non-numeric), transposed so
# that each row of the result describes one column of df
df.describe(include='all').transpose()


Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
sl_no,215.0,,,,108.0,62.209324,1.0,54.5,108.0,161.5,215.0
gender,215.0,,,,0.353488,0.479168,0.0,0.0,0.0,1.0,1.0
ssc_p,215.0,,,,67.303395,10.827205,40.89,60.6,67.0,75.7,89.4
ssc_b,215.0,2.0,Central,116.0,,,,,,,
hsc_p,215.0,,,,66.333163,10.897509,37.0,60.9,65.0,73.0,97.7
hsc_b,215.0,2.0,Others,131.0,,,,,,,
hsc_s,215.0,3.0,Commerce,113.0,,,,,,,
degree_p,215.0,,,,66.370186,7.358743,50.0,61.0,66.0,72.0,91.0
degree_t,215.0,3.0,Comm&Mgmt,145.0,,,,,,,
workex,215.0,2.0,No,141.0,,,,,,,


In [7]:
# Checking for null values in the dataset (count of missing entries per column)
df.isnull().sum()

sl_no              0
gender             0
ssc_p              0
ssc_b              0
hsc_p              0
hsc_b              0
hsc_s              0
degree_p           0
degree_t           0
workex             0
etest_p            0
specialisation     0
mba_p              0
status             0
salary            67
dtype: int64

In [8]:
# Number of distinct non-null values in each column
df.nunique()


sl_no             215
gender              2
ssc_p             103
ssc_b               2
hsc_p              97
hsc_b               2
hsc_s               3
degree_p           89
degree_t            3
workex              2
etest_p           100
specialisation      2
mba_p             205
status              2
salary             45
dtype: int64

In [9]:
# Checking the distinct values of the categorical columns, starting with ssc_b
df['ssc_b'].unique()


array(['Others', 'Central'], dtype=object)

In [10]:
# Distinct values of the specialisation column
df['specialisation'].unique()

array(['Mkt&HR', 'Mkt&Fin'], dtype=object)

In [11]:
# Distinct values of status, the column later used as the prediction target
df['status'].unique()

array(['Placed', 'Not Placed'], dtype=object)

In [12]:
# Display the DataFrame.
# NOTE(review): repeats the display from the loading cell; no cell in between
# modifies df.
df

Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
0,1,0,67.00,Others,91.00,Others,Commerce,58.00,Sci&Tech,No,55.0,Mkt&HR,58.80,Placed,270000.0
1,2,0,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed,200000.0
2,3,0,65.00,Central,68.00,Central,Arts,64.00,Comm&Mgmt,No,75.0,Mkt&Fin,57.80,Placed,250000.0
3,4,0,56.00,Central,52.00,Central,Science,52.00,Sci&Tech,No,66.0,Mkt&HR,59.43,Not Placed,
4,5,0,85.80,Central,73.60,Central,Commerce,73.30,Comm&Mgmt,No,96.8,Mkt&Fin,55.50,Placed,425000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,211,0,80.60,Others,82.00,Others,Commerce,77.60,Comm&Mgmt,No,91.0,Mkt&Fin,74.49,Placed,400000.0
211,212,0,58.00,Others,60.00,Others,Science,72.00,Sci&Tech,No,74.0,Mkt&Fin,53.62,Placed,275000.0
212,213,0,67.00,Others,67.00,Others,Commerce,73.00,Comm&Mgmt,Yes,59.0,Mkt&Fin,69.72,Placed,295000.0
213,214,1,74.00,Others,66.00,Others,Commerce,58.00,Comm&Mgmt,No,70.0,Mkt&HR,60.23,Placed,204000.0


In [13]:
# Print each column's dtype and non-null count, plus the frame's memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 215 entries, 0 to 214
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   sl_no           215 non-null    int64  
 1   gender          215 non-null    int64  
 2   ssc_p           215 non-null    float64
 3   ssc_b           215 non-null    object 
 4   hsc_p           215 non-null    float64
 5   hsc_b           215 non-null    object 
 6   hsc_s           215 non-null    object 
 7   degree_p        215 non-null    float64
 8   degree_t        215 non-null    object 
 9   workex          215 non-null    object 
 10  etest_p         215 non-null    float64
 11  specialisation  215 non-null    object 
 12  mba_p           215 non-null    float64
 13  status          215 non-null    object 
 14  salary          148 non-null    float64
dtypes: float64(6), int64(2), object(7)
memory usage: 25.3+ KB


# Feature Engineering 
Selecting the columns used for modelling and removing IQR outliers in `hsc_p` and `degree_p`. The percentage columns are kept as float64.



In [14]:
# Count missing entries per column before selecting features.
# NOTE(review): same check as the earlier null-count cell; df is unchanged since then.
df.isnull().sum()

sl_no              0
gender             0
ssc_p              0
ssc_b              0
hsc_p              0
hsc_b              0
hsc_s              0
degree_p           0
degree_t           0
workex             0
etest_p            0
specialisation     0
mba_p              0
status             0
salary            67
dtype: int64

In [16]:
# Columns carried forward: the six model inputs followed by the target label
columns_to_extract = [
    'gender',
    'ssc_p',
    'hsc_p',
    'degree_p',
    'etest_p',
    'mba_p',
    'status',
]

# Narrow df to those columns; every row is retained
df2 = df.loc[:, columns_to_extract]

# Plain-text preview of the first five rows
print(df2.head(5))


   gender  ssc_p  hsc_p  degree_p  etest_p  mba_p      status
0       0  67.00  91.00     58.00     55.0  58.80      Placed
1       0  79.33  78.33     77.48     86.5  66.28      Placed
2       0  65.00  68.00     64.00     75.0  57.80      Placed
3       0  56.00  52.00     52.00     66.0  59.43  Not Placed
4       0  85.80  73.60     73.30     96.8  55.50      Placed


In [17]:
def detect_outliers_iqr(df, column, factor=1.5):
    """Return the rows of ``df`` whose ``column`` value is an IQR outlier.

    A value is flagged when it lies strictly below ``Q1 - factor * IQR`` or
    strictly above ``Q3 + factor * IQR``, where ``Q1`` and ``Q3`` are the 25th
    and 75th percentiles of the column and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to inspect. It is not modified.
    column : str
        Name of a numeric column in ``df``.
    factor : float, default 1.5
        Multiplier applied to the IQR when building the fences. The default
        reproduces the conventional 1.5 * IQR rule.

    Returns
    -------
    pandas.DataFrame
        The subset of ``df`` rows flagged as outliers, with their original
        index labels. Rows whose value is NaN are never flagged because both
        comparisons evaluate to False for NaN.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1

    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr

    # Strict inequalities: values exactly on a fence are not outliers
    is_outlier = (values < lower_bound) | (values > upper_bound)
    return df[is_outlier]

# Example: flag the hsc_p outliers in the full dataset and report how many rows were found
outliers_hsc_p = detect_outliers_iqr(df, column='hsc_p')
print(f"Number of outliers in 'hsc_p': {outliers_hsc_p.shape[0]}")

Number of outliers in 'hsc_p': 8


In [18]:
# Show the rows flagged as hsc_p outliers by the IQR rule
outliers_hsc_p

Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
24,25,0,76.5,Others,97.7,Others,Science,78.86,Sci&Tech,No,97.4,Mkt&Fin,74.01,Placed,360000.0
42,43,0,49.0,Others,39.0,Central,Science,65.0,Others,No,63.0,Mkt&Fin,51.21,Not Placed,
49,50,1,50.0,Others,37.0,Others,Arts,52.0,Others,No,65.0,Mkt&HR,56.11,Not Placed,
120,121,0,58.0,Others,40.0,Others,Science,59.0,Comm&Mgmt,No,73.0,Mkt&HR,58.81,Not Placed,
134,135,1,77.44,Central,92.0,Others,Commerce,72.0,Comm&Mgmt,Yes,94.0,Mkt&Fin,67.13,Placed,250000.0
169,170,0,59.96,Others,42.16,Others,Science,61.26,Sci&Tech,No,54.48,Mkt&HR,65.48,Not Placed,
177,178,1,73.0,Central,97.0,Others,Commerce,79.0,Comm&Mgmt,Yes,89.0,Mkt&Fin,70.81,Placed,650000.0
206,207,0,41.0,Central,42.0,Central,Science,60.0,Comm&Mgmt,No,97.0,Mkt&Fin,53.39,Not Placed,


In [19]:
# detect_outliers_iqr is already defined in an earlier cell; reuse it here
# instead of redefining an identical copy.

# Example: Detect outliers for a single variable
outliers_degree_p = detect_outliers_iqr(df, 'degree_p')
print(f"Number of outliers in 'degree_p': {len(outliers_degree_p)}")

Number of outliers in 'degree_p': 1


In [20]:
# Show the rows flagged as degree_p outliers by the IQR rule
outliers_degree_p

Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
197,198,1,83.96,Others,53.0,Others,Science,91.0,Sci&Tech,No,59.32,Mkt&HR,69.71,Placed,260000.0


In [21]:
# Quartiles of hsc_p in df2 and the spread between them
Q1, Q3 = df2['hsc_p'].quantile(0.25), df2['hsc_p'].quantile(0.75)
IQR = Q3 - Q1

# Fences at 1.5 * IQR beyond each quartile
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Keep only rows whose hsc_p lies within the fences (both ends inclusive)
df3 = df2[df2['hsc_p'].between(lower_bound, upper_bound)]

# Show the filtered frame
df3


Unnamed: 0,gender,ssc_p,hsc_p,degree_p,etest_p,mba_p,status
0,0,67.00,91.00,58.00,55.0,58.80,Placed
1,0,79.33,78.33,77.48,86.5,66.28,Placed
2,0,65.00,68.00,64.00,75.0,57.80,Placed
3,0,56.00,52.00,52.00,66.0,59.43,Not Placed
4,0,85.80,73.60,73.30,96.8,55.50,Placed
...,...,...,...,...,...,...,...
210,0,80.60,82.00,77.60,91.0,74.49,Placed
211,0,58.00,60.00,72.00,74.0,53.62,Placed
212,0,67.00,67.00,73.00,59.0,69.72,Placed
213,1,74.00,66.00,58.00,70.0,60.23,Placed


In [22]:
# Quartiles of degree_p, computed on df3 (i.e. after the hsc_p filter), and their spread
Q1, Q3 = df3['degree_p'].quantile(0.25), df3['degree_p'].quantile(0.75)
IQR = Q3 - Q1

# Fences at 1.5 * IQR beyond each quartile
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Keep only rows whose degree_p lies within the fences (both ends inclusive)
df4 = df3[df3['degree_p'].between(lower_bound, upper_bound)]

# Show the filtered frame
df4


Unnamed: 0,gender,ssc_p,hsc_p,degree_p,etest_p,mba_p,status
0,0,67.00,91.00,58.00,55.0,58.80,Placed
1,0,79.33,78.33,77.48,86.5,66.28,Placed
2,0,65.00,68.00,64.00,75.0,57.80,Placed
3,0,56.00,52.00,52.00,66.0,59.43,Not Placed
4,0,85.80,73.60,73.30,96.8,55.50,Placed
...,...,...,...,...,...,...,...
210,0,80.60,82.00,77.60,91.0,74.49,Placed
211,0,58.00,60.00,72.00,74.0,53.62,Placed
212,0,67.00,67.00,73.00,59.0,69.72,Placed
213,1,74.00,66.00,58.00,70.0,60.23,Placed


In [23]:
# Export the outlier-filtered frame to disk, leaving out the index column
df4.to_csv(path_or_buf='df4.csv', index=False)

# Tell the reader the export finished
print("df4 has been saved as 'df4.csv'")


df4 has been saved as 'df4.csv'


In [24]:

from sklearn.model_selection import train_test_split  # used by the split in the model-training cell

# Features and Target
selected_features = ['gender', 'ssc_p',  'hsc_p', 'degree_p', 'etest_p', 'mba_p']  # Selected features
target_column = 'status'

# Filter DataFrame for Selected Features and Target
X = df4[selected_features]
y = df4[target_column]

# The train/test split is performed once, in the model-training cell. The split
# that used to live here (random_state=42) was overwritten there with a
# different seed before it was ever used, so it has been removed.




In [25]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Hold out 20% of the rows for evaluation; the seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=11,
)

# Build a seeded random forest and fit it on the training portion
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predicted labels for the held-out rows
y_pred = rf_model.predict(X_test)

# Overall accuracy followed by per-class precision / recall / F1
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Importance of each feature, labelled by column name and listed largest first
feature_importance = pd.Series(rf_model.feature_importances_, index=X.columns)
print("\nFeature Importance:\n", feature_importance.sort_values(ascending=False))


Accuracy: 0.9047619047619048

Classification Report:
               precision    recall  f1-score   support

  Not Placed       1.00      0.71      0.83        14
      Placed       0.88      1.00      0.93        28

    accuracy                           0.90        42
   macro avg       0.94      0.86      0.88        42
weighted avg       0.92      0.90      0.90        42


Feature Importance:
 ssc_p       0.321189
hsc_p       0.196249
degree_p    0.190984
mba_p       0.149832
etest_p     0.099189
gender      0.042557
dtype: float64


In [28]:
# Score one hypothetical student. The values are wrapped in a one-row DataFrame
# that reuses the training columns, so each number is bound to a named feature
# (order: gender, ssc_p, hsc_p, degree_p, etest_p, mba_p). The model was fitted
# on a DataFrame with these column names, so this also avoids scikit-learn's
# "X does not have valid feature names" warning.
rf_model.predict(pd.DataFrame([[0, 25, 11, 14, 17, 60]], columns=X.columns))


array(['Not Placed'], dtype=object)