In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

# Reading the CSV file directly
df = pd.read_csv("Placement_Data_Full_Class.csv")

# Data Preprocessing
# Drop the row identifier and the salary column. This happens before dropna(),
# so a missing salary can never cause a row to be discarded.
df = df.drop(['sl_no', 'salary'], axis=1)

# Handling missing values (if any)
df = df.dropna()

# Encoding categorical variables.
# Every column gets its own LabelEncoder so each fitted mapping stays available
# afterwards; a single shared encoder is overwritten on every fit and only
# remembers the last column it saw.
categorical_cols = ['gender', 'ssc_b', 'hsc_b', 'hsc_s', 'degree_t', 'workex', 'specialisation', 'status']
label_encoders = {}
for col in categorical_cols:
    label_encoders[col] = LabelEncoder()
    df[col] = label_encoders[col].fit_transform(df[col])

# `le` is kept as an explicit alias of the target encoder because later code in
# this cell decodes `status` through it. Previously this only worked because
# 'status' happened to be the last entry of `categorical_cols`.
le = label_encoders['status']

# Separating features and target
X = df.drop('status', axis=1)
y = df['status']

# Splitting the data BEFORE scaling so the scaler statistics are computed from
# the training rows only (fitting on the full frame leaks test information).
# The row assignment is unchanged: it depends only on the row count and seed.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling numerical features: fit on train, apply the same transform to test
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full scaled matrix, kept under its original name for any later consumer
X_scaled = scaler.transform(X)

# Training Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Making predictions and calculating probabilities
y_pred = rf_model.predict(X_test)
y_prob = rf_model.predict_proba(X_test)[:, 1]  # Probability of being 'Placed'

# Evaluating the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")
print("\nClassification Report:")
# Class names come from the fitted encoder so they always match the encoding
print(classification_report(y_test, y_pred, target_names=list(le.classes_)))

# Decode the integer target once; both status-based plots below reuse it.
status_labels = le.inverse_transform(y)

# Feature importance, highest first
fig, ax = plt.subplots(figsize=(10, 6))
feature_importance = pd.Series(rf_model.feature_importances_, index=X.columns)
feature_importance.sort_values(ascending=False).plot(kind='bar', ax=ax)
ax.set_title('Feature Importance in Placement Prediction')
ax.set_xlabel('Features')
ax.set_ylabel('Importance Score')
fig.tight_layout()
fig.savefig('feature_importance.png')
plt.close(fig)

# Histogram of the predicted probabilities on the test split
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(y_prob, bins=20, kde=True, ax=ax)
ax.set_title('Distribution of Placement Probability')
ax.set_xlabel('Probability of Being Placed')
ax.set_ylabel('Count')
fig.savefig('probability_distribution.png')
plt.close(fig)

# SSC vs MBA percentage, marker colour and shape both keyed on status
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='ssc_p', y='mba_p', hue=status_labels, style=status_labels, ax=ax)
ax.set_title('SSC Percentage vs MBA Percentage by Placement Status')
ax.set_xlabel('SSC Percentage')
ax.set_ylabel('MBA Percentage')
ax.legend(title='Status')
fig.savefig('ssc_vs_mba.png')
plt.close(fig)

# Degree percentage split by status
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x=status_labels, y=df['degree_p'], ax=ax)
ax.set_title('Degree Percentage Distribution by Placement Status')
ax.set_xlabel('Placement Status')
ax.set_ylabel('Degree Percentage')
fig.savefig('degree_boxplot.png')
plt.close(fig)

# Pairwise correlations of the encoded frame (target column included)
fig, ax = plt.subplots(figsize=(12, 8))
correlation_matrix = df.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Correlation Heatmap of Features')
fig.savefig('correlation_heatmap.png')
plt.close(fig)

# Summary of the files written above
summary_lines = [
    "\nGenerated Visualizations:",
    "- feature_importance.png: Shows which features most influence placement",
    "- probability_distribution.png: Distribution of predicted placement probabilities",
    "- ssc_vs_mba.png: Scatter plot of SSC vs MBA percentages by placement status",
    "- degree_boxplot.png: Degree percentage distribution by placement status",
    "- correlation_heatmap.png: Correlation between all features",
]
for summary_line in summary_lines:
    print(summary_line)

Model Accuracy: 0.79

Classification Report:
              precision    recall  f1-score   support

  Not Placed       0.67      0.50      0.57        12
      Placed       0.82      0.90      0.86        31

    accuracy                           0.79        43
   macro avg       0.75      0.70      0.72        43
weighted avg       0.78      0.79      0.78        43


Generated Visualizations:
- feature_importance.png: Shows which features most influence placement
- probability_distribution.png: Distribution of predicted placement probabilities
- ssc_vs_mba.png: Scatter plot of SSC vs MBA percentages by placement status
- degree_boxplot.png: Degree percentage distribution by placement status
- correlation_heatmap.png: Correlation between all features


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

# Reading the CSV file
df = pd.read_csv("Placement_Data_Full_Class.csv")

# Data Preprocessing
# Drop the row identifier and the salary column. This happens before dropna(),
# so a missing salary can never cause a row to be discarded.
df = df.drop(['sl_no', 'salary'], axis=1)

# Handling missing values
df = df.dropna()

# One LabelEncoder per categorical column, stored by column name so the same
# mappings can be reused on new inputs and to decode predictions.
label_encoders = {}
categorical_cols = ['gender', 'ssc_b', 'hsc_b', 'hsc_s', 'degree_t', 'workex', 'specialisation', 'status']

for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Separating features and target
X = df.drop('status', axis=1)
y = df['status']

# Splitting the data BEFORE scaling so the scaler statistics are computed from
# the training rows only (fitting on the full frame leaks test information).
# The row assignment is unchanged: it depends only on the row count and seed.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling numerical features: fit on train, apply the same transform to test.
# The scaler is fitted on a DataFrame, so it can later transform a DataFrame
# that carries the same column names.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full scaled matrix, kept under its original name for any later consumer
X_scaled = scaler.transform(X)

# Training Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Evaluating the model
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")
print("\nClassification Report:")
# Class names come from the fitted encoder so they always match the encoding
print(classification_report(y_test, y_pred, target_names=list(label_encoders['status'].classes_)))

# Function to predict placement for new raw input
def predict_placement(raw_input):
    """
    Predict placement status for a single raw (un-encoded, un-scaled) record.

    Relies on the module-level `label_encoders`, `scaler` and `rf_model`
    objects fitted earlier in this cell.

    Parameters
    ----------
    raw_input : dict
        Mapping of feature name to raw value. Must provide every dataset
        column except sl_no, salary and status. Categorical features are
        given as their original string labels.

    Returns
    -------
    tuple
        (predicted_status, probability): the decoded status label and the
        model's probability for the class at index 1 of `predict_proba`.

    Raises
    ------
    ValueError
        If a required feature is missing or null, or if a categorical value
        was not seen when the corresponding encoder was fitted.
    """
    # Expected columns in order
    columns = ['gender', 'ssc_p', 'ssc_b', 'hsc_p', 'hsc_b', 'hsc_s', 'degree_p', 'degree_t', 'workex', 'etest_p', 'specialisation', 'mba_p']
    categorical = ['gender', 'ssc_b', 'hsc_b', 'hsc_s', 'degree_t', 'workex', 'specialisation']

    # Create a one-row DataFrame from the input; absent keys become NaN
    input_df = pd.DataFrame([raw_input], columns=columns)

    # Reject incomplete records up front instead of letting NaN flow into the
    # scaler/model or surface as a misleading "invalid value" encoder error.
    missing = list(input_df.columns[input_df.iloc[0].isna()])
    if missing:
        raise ValueError(f"Missing value for required feature(s): {missing}")

    # Encode categorical variables with the encoders fitted on the training data
    for col in categorical:
        encoder = label_encoders[col]
        try:
            input_df[col] = encoder.transform(input_df[col])
        except ValueError as err:
            raise ValueError(f"Invalid value for {col}. Must be one of {list(encoder.classes_)}") from err

    # Scale with the fitted scaler (applied to every column, as during training)
    input_scaled = scaler.transform(input_df)

    # Make prediction
    prediction = rf_model.predict(input_scaled)[0]
    probability = rf_model.predict_proba(input_scaled)[0][1]  # Probability of 'Placed'

    # Convert numerical prediction back to string
    predicted_status = label_encoders['status'].inverse_transform([prediction])[0]

    return predicted_status, probability

# Demonstrate predict_placement on one hand-written student record.
# Categorical fields use the raw string labels; percentages are floats.
example_input = dict([
    ('gender', 'M'),
    ('ssc_p', 75.0),
    ('ssc_b', 'Central'),
    ('hsc_p', 70.0),
    ('hsc_b', 'Others'),
    ('hsc_s', 'Science'),
    ('degree_p', 68.0),
    ('degree_t', 'Sci&Tech'),
    ('workex', 'Yes'),
    ('etest_p', 80.0),
    ('specialisation', 'Mkt&Fin'),
    ('mba_p', 65.0),
])

# A ValueError from the prediction step is reported rather than propagated,
# so the rest of the cell still runs.
try:
    status, prob = predict_placement(example_input)
    print(f"\nPrediction for example input:")
    print(f"Placement Status: {status}")
    print(f"Probability of Placement: {prob:.2%}")
except ValueError as err:
    print(f"Error: {err}")

# Visualizations
# Decode the integer target once; both status-based plots below reuse it.
status_labels = label_encoders['status'].inverse_transform(y)

# Feature importance, highest first
fig, ax = plt.subplots(figsize=(10, 6))
feature_importance = pd.Series(rf_model.feature_importances_, index=X.columns)
feature_importance.sort_values(ascending=False).plot(kind='bar', ax=ax)
ax.set_title('Feature Importance in Placement Prediction')
ax.set_xlabel('Features')
ax.set_ylabel('Importance Score')
fig.tight_layout()
fig.savefig('feature_importance.png')
plt.close(fig)

# Histogram of the predicted probabilities on the test split
y_prob = rf_model.predict_proba(X_test)[:, 1]
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(y_prob, bins=20, kde=True, ax=ax)
ax.set_title('Distribution of Placement Probability')
ax.set_xlabel('Probability of Being Placed')
ax.set_ylabel('Count')
fig.savefig('probability_distribution.png')
plt.close(fig)

# SSC vs MBA percentage, marker colour and shape both keyed on status
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='ssc_p', y='mba_p', hue=status_labels, style=status_labels, ax=ax)
ax.set_title('SSC Percentage vs MBA Percentage by Placement Status')
ax.set_xlabel('SSC Percentage')
ax.set_ylabel('MBA Percentage')
ax.legend(title='Status')
fig.savefig('ssc_vs_mba.png')
plt.close(fig)

# Degree percentage split by status
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x=status_labels, y=df['degree_p'], ax=ax)
ax.set_title('Degree Percentage Distribution by Placement Status')
ax.set_xlabel('Placement Status')
ax.set_ylabel('Degree Percentage')
fig.savefig('degree_boxplot.png')
plt.close(fig)

# Pairwise correlations of the encoded frame (target column included)
fig, ax = plt.subplots(figsize=(12, 8))
correlation_matrix = df.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Correlation Heatmap of Features')
fig.savefig('correlation_heatmap.png')
plt.close(fig)

# Summary of the files written above
summary_lines = [
    "\nGenerated Visualizations:",
    "- feature_importance.png: Shows which features most influence placement",
    "- probability_distribution.png: Distribution of predicted placement probabilities",
    "- ssc_vs_mba.png: Scatter plot of SSC vs MBA percentages by placement status",
    "- degree_boxplot.png: Degree percentage distribution by placement status",
    "- correlation_heatmap.png: Correlation between all features",
]
for summary_line in summary_lines:
    print(summary_line)

Model Accuracy: 0.79

Classification Report:
              precision    recall  f1-score   support

  Not Placed       0.67      0.50      0.57        12
      Placed       0.82      0.90      0.86        31

    accuracy                           0.79        43
   macro avg       0.75      0.70      0.72        43
weighted avg       0.78      0.79      0.78        43


Prediction for example input:
Placement Status: Placed
Probability of Placement: 100.00%

Generated Visualizations:
- feature_importance.png: Shows which features most influence placement
- probability_distribution.png: Distribution of predicted placement probabilities
- ssc_vs_mba.png: Scatter plot of SSC vs MBA percentages by placement status
- degree_boxplot.png: Degree percentage distribution by placement status
- correlation_heatmap.png: Correlation between all features


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

# Reading the CSV file
df = pd.read_csv("Placement_Data_Full_Class.csv")

# Data Preprocessing
# Drop the row identifier and the salary column. This happens before dropna(),
# so a missing salary can never cause a row to be discarded.
df = df.drop(['sl_no', 'salary'], axis=1)

# Handling missing values
df = df.dropna()

# One LabelEncoder per categorical column, stored by column name so the same
# mappings can be persisted and reused on new inputs.
label_encoders = {}
categorical_cols = ['gender', 'ssc_b', 'hsc_b', 'hsc_s', 'degree_t', 'workex', 'specialisation', 'status']

for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Separating features and target
X = df.drop('status', axis=1)
y = df['status']

# Splitting the data BEFORE scaling so the scaler statistics are computed from
# the training rows only (fitting on the full frame leaks test information).
# The row assignment is unchanged: it depends only on the row count and seed.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling numerical features: fit on train, apply the same transform to test.
# This train-only scaler is the object available for persistence afterwards.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full scaled matrix, kept under its original name for any later consumer
X_scaled = scaler.transform(X)

# Training Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Evaluating the model
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")
print("\nClassification Report:")
# Class names come from the fitted encoder so they always match the encoding
print(classification_report(y_test, y_pred, target_names=list(label_encoders['status'].classes_)))

# Persist the trained model together with the fitted preprocessing objects,
# one joblib file per object, written in the order listed.
saved_artifacts = [
    ('placement_model.joblib', rf_model),
    ('scaler.joblib', scaler),
    ('label_encoders.joblib', label_encoders),
]
for artifact_path, artifact in saved_artifacts:
    joblib.dump(artifact, artifact_path)

# Tell the reader which files were written
for saved_line in (
    "\nModel and preprocessing objects saved as:",
    "- placement_model.joblib (trained model)",
    "- scaler.joblib (scaler for numerical features)",
    "- label_encoders.joblib (encoders for categorical features)",
):
    print(saved_line)

# Visualizations
# Decode the integer target once; both status-based plots below reuse it.
status_labels = label_encoders['status'].inverse_transform(y)

# Feature importance, highest first
fig, ax = plt.subplots(figsize=(10, 6))
feature_importance = pd.Series(rf_model.feature_importances_, index=X.columns)
feature_importance.sort_values(ascending=False).plot(kind='bar', ax=ax)
ax.set_title('Feature Importance in Placement Prediction')
ax.set_xlabel('Features')
ax.set_ylabel('Importance Score')
fig.tight_layout()
fig.savefig('feature_importance.png')
plt.close(fig)

# Histogram of the predicted probabilities on the test split
y_prob = rf_model.predict_proba(X_test)[:, 1]
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(y_prob, bins=20, kde=True, ax=ax)
ax.set_title('Distribution of Placement Probability')
ax.set_xlabel('Probability of Being Placed')
ax.set_ylabel('Count')
fig.savefig('probability_distribution.png')
plt.close(fig)

# SSC vs MBA percentage, marker colour and shape both keyed on status
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='ssc_p', y='mba_p', hue=status_labels, style=status_labels, ax=ax)
ax.set_title('SSC Percentage vs MBA Percentage by Placement Status')
ax.set_xlabel('SSC Percentage')
ax.set_ylabel('MBA Percentage')
ax.legend(title='Status')
fig.savefig('ssc_vs_mba.png')
plt.close(fig)

# Degree percentage split by status
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x=status_labels, y=df['degree_p'], ax=ax)
ax.set_title('Degree Percentage Distribution by Placement Status')
ax.set_xlabel('Placement Status')
ax.set_ylabel('Degree Percentage')
fig.savefig('degree_boxplot.png')
plt.close(fig)

# Pairwise correlations of the encoded frame (target column included)
fig, ax = plt.subplots(figsize=(12, 8))
correlation_matrix = df.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Correlation Heatmap of Features')
fig.savefig('correlation_heatmap.png')
plt.close(fig)

# Summary of the files written above
summary_lines = [
    "\nGenerated Visualizations:",
    "- feature_importance.png: Shows which features most influence placement",
    "- probability_distribution.png: Distribution of predicted placement probabilities",
    "- ssc_vs_mba.png: Scatter plot of SSC vs MBA percentages by placement status",
    "- degree_boxplot.png: Degree percentage distribution by placement status",
    "- correlation_heatmap.png: Correlation between all features",
]
for summary_line in summary_lines:
    print(summary_line)

Model Accuracy: 0.79

Classification Report:
              precision    recall  f1-score   support

  Not Placed       0.67      0.50      0.57        12
      Placed       0.82      0.90      0.86        31

    accuracy                           0.79        43
   macro avg       0.75      0.70      0.72        43
weighted avg       0.78      0.79      0.78        43


Model and preprocessing objects saved as:
- placement_model.joblib (trained model)
- scaler.joblib (scaler for numerical features)
- label_encoders.joblib (encoders for categorical features)

Generated Visualizations:
- feature_importance.png: Shows which features most influence placement
- probability_distribution.png: Distribution of predicted placement probabilities
- ssc_vs_mba.png: Scatter plot of SSC vs MBA percentages by placement status
- degree_boxplot.png: Degree percentage distribution by placement status
- correlation_heatmap.png: Correlation between all features
