In [68]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder , LabelEncoder , StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report , accuracy_score
from sklearn.ensemble import RandomForestClassifier
from ydata_profiling import ProfileReport

In [3]:
# Read the placement dataset from the working directory and preview the first rows.
DATASET_CSV = 'college_student_placement_dataset.csv'
data = pd.read_csv(DATASET_CSV)
data.head(n=5)

Unnamed: 0,College_ID,IQ,Prev_Sem_Result,CGPA,Academic_Performance,Internship_Experience,Extra_Curricular_Score,Communication_Skills,Projects_Completed,Placement
0,CLG0030,107,6.61,6.28,8,No,8,8,4,No
1,CLG0061,97,5.52,5.37,8,No,7,8,0,No
2,CLG0036,109,5.36,5.83,9,No,3,1,1,No
3,CLG0055,122,5.47,5.75,6,Yes,1,6,1,No
4,CLG0004,96,7.91,7.69,7,No,8,10,2,No


In [70]:
# Build an exploratory profiling report for `data` and write it to an HTML file.
# NOTE(review): the recorded execution count (In [70]) suggests this cell was last
# run after the encoding cells below; on a fresh top-to-bottom run it profiles the
# frame exactly as loaded from the CSV.
report_options = {"title": "College Student Report", "explorative": True}
profile = ProfileReport(data, **report_options)
profile.to_file("College.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 9/9 [00:00<00:00, 473.45it/s]
  discretized_df.loc[:, column] = self._discretize_column(


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [4]:
# Print each column's dtype and non-null count, plus the frame's memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   College_ID              10000 non-null  object 
 1   IQ                      10000 non-null  int64  
 2   Prev_Sem_Result         10000 non-null  float64
 3   CGPA                    10000 non-null  float64
 4   Academic_Performance    10000 non-null  int64  
 5   Internship_Experience   10000 non-null  object 
 6   Extra_Curricular_Score  10000 non-null  int64  
 7   Communication_Skills    10000 non-null  int64  
 8   Projects_Completed      10000 non-null  int64  
 9   Placement               10000 non-null  object 
dtypes: float64(2), int64(5), object(3)
memory usage: 781.4+ KB


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# NOTE(review): the recorded output shows a CGPA max of 10.46 while Prev_Sem_Result
# tops out at 10.0 — presumably both are on a 10-point scale; confirm whether CGPA
# values above 10 are valid before modelling.
data.describe()

Unnamed: 0,IQ,Prev_Sem_Result,CGPA,Academic_Performance,Extra_Curricular_Score,Communication_Skills,Projects_Completed
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,99.4718,7.535673,7.532379,5.5464,4.9709,5.5618,2.5134
std,15.053101,1.447519,1.470141,2.873477,3.160103,2.900866,1.715959
min,41.0,5.0,4.54,1.0,0.0,1.0,0.0
25%,89.0,6.29,6.29,3.0,2.0,3.0,1.0
50%,99.0,7.56,7.55,6.0,5.0,6.0,3.0
75%,110.0,8.79,8.77,8.0,8.0,8.0,4.0
max,158.0,10.0,10.46,10.0,10.0,10.0,5.0


In [8]:
# Count missing values per column.
data.isna().sum()

College_ID                0
IQ                        0
Prev_Sem_Result           0
CGPA                      0
Academic_Performance      0
Internship_Experience     0
Extra_Curricular_Score    0
Communication_Skills      0
Projects_Completed        0
Placement                 0
dtype: int64

In [25]:
# Replace the Placement target column with integer codes produced by a LabelEncoder
# (the column is overwritten in `data`), then display the encoded column.
le = LabelEncoder()
placement_codes = le.fit_transform(data['Placement'])
data['Placement'] = placement_codes
data['Placement']

0       0
1       0
2       0
3       0
4       0
       ..
9995    1
9996    0
9997    0
9998    0
9999    0
Name: Placement, Length: 10000, dtype: int64

In [26]:
# Replace the Internship_Experience column with integer codes (the column is
# overwritten in `data`), then display the encoded column.
# A dedicated encoder name is used instead of rebinding `le`: rebinding would
# discard the encoder fitted on the Placement target, and with it the ability to
# inspect `classes_` or call `inverse_transform` on predictions later.
internship_encoder = LabelEncoder()
data['Internship_Experience'] = internship_encoder.fit_transform(data['Internship_Experience'])
data['Internship_Experience']

0       0
1       0
2       0
3       1
4       0
       ..
9995    0
9996    0
9997    1
9998    0
9999    0
Name: Internship_Experience, Length: 10000, dtype: int32

In [27]:
# Display the frame to check the result of the two encoding steps
# (pandas elides the middle rows in the rendered output).
data

Unnamed: 0,College_ID,IQ,Prev_Sem_Result,CGPA,Academic_Performance,Internship_Experience,Extra_Curricular_Score,Communication_Skills,Projects_Completed,Placement
0,CLG0030,107,6.61,6.28,8,0,8,8,4,0
1,CLG0061,97,5.52,5.37,8,0,7,8,0,0
2,CLG0036,109,5.36,5.83,9,0,3,1,1,0
3,CLG0055,122,5.47,5.75,6,1,1,6,1,0
4,CLG0004,96,7.91,7.69,7,0,8,10,2,0
...,...,...,...,...,...,...,...,...,...,...
9995,CLG0021,119,8.41,8.29,4,0,1,8,0,1
9996,CLG0098,70,9.25,9.34,7,0,0,7,2,0
9997,CLG0066,89,6.08,6.25,3,1,3,9,5,0
9998,CLG0045,107,8.77,8.92,3,0,7,5,1,0


In [31]:
# Remove the College_ID column so it is not used as a model feature.
# errors="ignore" keeps this cell re-runnable: without it, executing the cell a
# second time raises KeyError because the column has already been dropped.
data = data.drop(columns="College_ID", errors="ignore")

In [36]:
# Separate the target column (y) from the remaining feature columns (x)
# and print the shape of each.
target = "Placement"
y = data[target]
x = data.drop(columns=[target])

for part in (x, y):
    print(part.shape)

(10000, 8)
(10000,)


In [62]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split repeatable across runs.
split_parts = train_test_split(x, y, test_size=0.2, random_state=42)
x_train, x_test, y_train, y_test = split_parts

In [63]:
# Print the shape of each split, in the order x_train, y_train, x_test, y_test.
for split_part in (x_train, y_train, x_test, y_test):
    print(split_part.shape)

(8000, 8)
(8000,)
(2000, 8)
(2000,)


In [64]:
# Standardise the features. The scaler is fitted on the training split only and
# then applied to the test split, so test-set statistics never influence the fit.
scale = StandardScaler()

# fit_transform/transform return new arrays and leave their inputs untouched,
# so the results must be captured — otherwise the scaling is computed and lost.
x_train_scaled = scale.fit_transform(x_train)
x_test_scaled = scale.transform(x_test)

# Show the scaled test features as the cell output.
x_test_scaled

array([[ 2.62002551, -0.16760364, -0.19648942, ...,  0.95107556,
        -0.53605303,  0.86091247],
       [-1.03216799,  0.51551065,  0.65285299, ..., -0.63029462,
        -0.88039414,  1.44280667],
       [-0.3681328 , -1.56833294, -1.27005822, ...,  0.63480153,
         1.52999366,  0.86091247],
       ...,
       [ 0.56151645,  0.90881887,  0.87028464, ...,  0.31852749,
         1.18565254, -0.88477013],
       [-0.10251873,  1.62643388,  1.42745326, ..., -0.63029462,
        -1.56907637, -1.46666434],
       [-1.0985715 , -0.26420566, -0.50225269, ..., -0.31402058,
         0.84131143, -1.46666434]])

In [65]:
# Train a random forest on the training split produced by train_test_split.
# random_state is fixed (same seed as the split) so the fitted forest, and every
# metric computed from it, is reproducible across kernel restarts.
model = RandomForestClassifier(random_state=42)
model.fit(x_train, y_train)

In [66]:
# Predict the Placement label for every row of the held-out test features
# and display the predictions.
y_pred = model.predict(x_test)
y_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [67]:
# Accuracy on the held-out test set.
# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy gives the
# same value either way, but the conventional order keeps this call consistent
# with asymmetric metrics (precision, recall, classification_report).
# NOTE(review): the recorded score is exactly 1.0 — presumably worth checking
# for target leakage or a deterministic label rule before trusting it.
score = accuracy_score(y_test, y_pred)
score

1.0