# EDA and Prediction of Global College Statistics

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


## Load the dataset

In [None]:
# Location of the raw CSV file.
# NOTE(review): this is an absolute path on one machine; adjust it before
# running the notebook anywhere else.
DATA_PATH = '/Users/riteshkumar/Downloads/ML projects/Visualization of Global College Stats/College Data.csv'

# Read the CSV into the DataFrame that the following cells operate on.
df = pd.read_csv(DATA_PATH)

## Display the first few rows of the dataset

In [None]:
# Preview the first five rows (five is the default row count of head()).
df.head(n=5)

## Summary statistics


In [None]:
# Display descriptive statistics for the DataFrame's columns.
df.describe()

## Check for missing values

In [None]:
# Count missing entries per column (isna is the same check as isnull).
df.isna().sum()

## Plot histograms for each numerical feature

In [None]:
# One 30-bin histogram per numeric column, drawn on a 15x10 inch figure.
df.hist(figsize=(15, 10), bins=30)
plt.show()

## Define numerical and categorical columns

In [None]:
# Use seaborn's "whitegrid" theme for every plot that follows.
sns.set_theme(style="whitegrid")

# Columns plotted as numeric distributions / boxplots / correlations.
numerical_cols = [
    "Total Students",
    "Male",
    "Female",
    "CGPA",
    "Annual Family Income",
    "Research Papers Published",
    "Placement Rate",
    "Faculty Count",
]

# Columns plotted as category counts.
categorical_cols = [
    "Country",
    "Branch",
    "Sports",
]

## Create subplots for numerical feature distributions

In [None]:
# Replace infinity values with NaN in the dataframe so the histogram/KDE
# computation only sees finite values.
df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Size the grid from the number of features instead of hard-coding 3x3.
# With the 8 columns in numerical_cols this still yields a 3x3 grid on an
# 18x12 inch figure.
n_grid_cols = 3
n_grid_rows = max(1, -(-len(numerical_cols) // n_grid_cols))  # ceiling division
fig, axes = plt.subplots(
    nrows=n_grid_rows,
    ncols=n_grid_cols,
    figsize=(18, 4 * n_grid_rows),
    squeeze=False,  # always a 2-D array, so flatten() works for any grid size
)
fig.suptitle("Distribution of Numerical Features", fontsize=16)
axes = axes.flatten()

# One histogram with a KDE overlay per numerical feature.
for ax, col in zip(axes, numerical_cols):
    sns.histplot(df[col], bins=30, kde=True, ax=ax)
    ax.set_title(col)

# Hide grid cells that have no feature assigned; otherwise an empty frame
# is drawn in the unused position.
for ax in axes[len(numerical_cols):]:
    ax.set_visible(False)

# Leave room at the top of the figure for the suptitle.
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()

## Generate Correlation Heatmap

In [None]:
# Pairwise correlations between the numerical columns.
corr_matrix = df[numerical_cols].corr()

plt.figure(figsize=(10, 6))
sns.heatmap(
    corr_matrix,
    annot=True,       # print each coefficient in its cell
    fmt=".2f",
    cmap="coolwarm",
    linewidths=0.5,
)
plt.title("Correlation Heatmap")
plt.show()

## Generate Boxplots for Outlier Detection

In [None]:
# Size the grid from the number of features instead of hard-coding 3x3.
# With the 8 columns in numerical_cols this still yields a 3x3 grid on an
# 18x12 inch figure.
n_grid_cols = 3
n_grid_rows = max(1, -(-len(numerical_cols) // n_grid_cols))  # ceiling division
fig, axes = plt.subplots(
    nrows=n_grid_rows,
    ncols=n_grid_cols,
    figsize=(18, 4 * n_grid_rows),
    squeeze=False,  # always a 2-D array, so flatten() works for any grid size
)
fig.suptitle("Boxplots for Outlier Detection", fontsize=16)
axes = axes.flatten()

# One vertical boxplot per numerical feature.
for ax, col in zip(axes, numerical_cols):
    sns.boxplot(y=df[col], ax=ax)
    ax.set_title(col)

# Hide grid cells that have no feature assigned; otherwise an empty frame
# is drawn in the unused position.
for ax in axes[len(numerical_cols):]:
    ax.set_visible(False)

# Leave room at the top of the figure for the suptitle.
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()

## Count Plots for Categorical Features

In [None]:
# One horizontal count plot per categorical column, side by side.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
fig.suptitle("Count Plots for Categorical Features", fontsize=16)

for ax, col in zip(axes, categorical_cols):
    # value_counts() sorts by frequency, so the first ten index entries
    # are the ten most common categories.
    top_categories = df[col].value_counts().index[:10]
    sns.countplot(y=df[col], order=top_categories, ax=ax)
    ax.set_title(f"Top 10 {col}")

plt.tight_layout()
plt.show()

## Check for missing or non-numeric values in selected features

In [None]:
selected_features = [
    "CGPA",
    "Placement Rate",
    "Annual Family Income",
    "Research Papers Published",
]

# Inspect the subset once: info() prints its report (and returns None),
# followed by the per-column null counts and dtypes.
selected_df = df[selected_features]
selected_df.info(), selected_df.isnull().sum(), selected_df.dtypes

## Generate Pairplot without kde to avoid issues

In [None]:
# Turn +/- infinity into NaN before plotting.
df.replace(to_replace=[np.inf, -np.inf], value=np.nan, inplace=True)

# Scatter plots for every pair of the selected features.
pair_data = df[selected_features]
sns.pairplot(pair_data, kind='scatter')
plt.show()

## Identify features and target variable

In [None]:
# Column the regression models are trained to predict.
target = "Placement Rate"

# Numeric predictor columns.
features = [
    "CGPA",
    "Annual Family Income",
    "Research Papers Published",
    "Faculty Count",
]


## Handle categorical variables (encoding)

In [None]:
categorical_features = ["Country", "Branch", "Sports"]

# Replace each categorical column in place with integer codes produced by
# a LabelEncoder fitted on that column.
for col in categorical_features:
    le = LabelEncoder().fit(df[col])
    df[col] = le.transform(df[col])

## Update feature set to include encoded categorical variables

In [None]:
# Append only the categorical columns that are not already listed, so that
# re-running this cell does not add the same names twice (duplicate names
# would select duplicate columns in df[features]).
features.extend([col for col in categorical_features if col not in features])

## Split dataset into training and testing sets

In [None]:
# Predictor matrix and target vector.
X, y = df[features], df[target]

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Standardize numerical features

In [None]:
# Columns to standardize; the label-encoded categorical columns are left
# untouched.
scaled_cols = ["CGPA", "Annual Family Income", "Research Papers Published", "Faculty Count"]

# Work on explicit copies so the column assignments below write to
# independent frames rather than to slices of another frame (avoids
# pandas' SettingWithCopyWarning).
X_train = X_train.copy()
X_test = X_test.copy()

scaler = StandardScaler()
# Fit on the training split only, then apply the same fitted statistics to
# the test split.
X_train[scaled_cols] = scaler.fit_transform(X_train[scaled_cols])
X_test[scaled_cols] = scaler.transform(X_test[scaled_cols])

## Train Linear Regression Model

In [None]:
# fit() returns the estimator itself, so construction and training can be
# chained.
lr_model = LinearRegression().fit(X_train, y_train)

# Predictions on the held-out test split.
y_pred_lr = lr_model.predict(X_test)

## Train Random Forest Model

In [None]:
# 100 trees with a fixed seed; fit() returns the estimator itself.
rf_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Predictions on the held-out test split.
y_pred_rf = rf_model.predict(X_test)

## Model Evaluation Function

In [None]:
# Model Evaluation Function
def evaluate_model(y_true, y_pred, model_name):
    """Print MAE, RMSE and R² for one set of predictions.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted values to compare against ``y_true``.
        model_name: Label printed at the start of the report.

    Returns:
        None. The metrics are written to stdout, formatted to two decimals.
    """
    # RMSE is the square root of the mean squared error.
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    report = f"{model_name} Performance:\n MAE: {mae:.2f}, RMSE: {rmse:.2f}, R² Score: {r2:.2f}\n"
    print(report)


## Evaluate the models

In [None]:
# Report test-set metrics once per trained model. The earlier extra call
# printed the Random Forest predictions under the label "standard scaler",
# which is a preprocessing step rather than a model, so it was dropped.
evaluate_model(y_test, y_pred_lr, "Linear Regression")
evaluate_model(y_test, y_pred_rf, "Random Forest")


## Feature Importance Plot (Random Forest)

In [None]:
# Importance score of each feature as reported by the fitted forest,
# in the same order as `features`.
rf_importances = rf_model.feature_importances_

plt.figure(figsize=(10, 5))
sns.barplot(y=features, x=rf_importances)
plt.title("Feature Importance (Random Forest)")
plt.show()

## Feature Importance Plot (Linear Model)

In [None]:
# Fitted coefficient of each feature in the linear model, in the same
# order as `features`.
lr_coefficients = lr_model.coef_

plt.figure(figsize=(10, 5))
sns.barplot(y=features, x=lr_coefficients)
plt.title("Feature Importance (Linear Model)")
plt.show()

## Actual vs. Predicted Plot (Linear Regression)

In [None]:
# Range of the actual values; used to draw the y = x reference line.
actual_min, actual_max = y_test.min(), y_test.max()

plt.figure(figsize=(8, 5))
sns.scatterplot(x=y_test, y=y_pred_lr, color="blue", alpha=0.6)
# 45-degree line: points on it are predicted exactly.
plt.plot(
    [actual_min, actual_max],
    [actual_min, actual_max],
    color="red",
    linestyle="--",
)
plt.xlabel("Actual Placement Rate")
plt.ylabel("Predicted Placement Rate")
plt.title("Actual vs. Predicted (Linear Regression)")
plt.show()


## Actual vs. Predicted Plot (Random Forest)

In [None]:
# Range of the actual values; used to draw the y = x reference line.
actual_min, actual_max = y_test.min(), y_test.max()

plt.figure(figsize=(8, 5))
sns.scatterplot(x=y_test, y=y_pred_rf, color="green", alpha=0.6)
# 45-degree line: points on it are predicted exactly.
plt.plot(
    [actual_min, actual_max],
    [actual_min, actual_max],
    color="red",
    linestyle="--",
)
plt.xlabel("Actual Placement Rate")
plt.ylabel("Predicted Placement Rate")
plt.title("Actual vs. Predicted (Random Forest)")
plt.show()