# Importing necessary libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matoplotlib.pyplot as plt
import warnings

warnings.filterwarnings('ignore')

%matplotlib inline
plt.switch_backend('agg')


sns.set(style='whitegrid', palette='muted', font_scale=1.2)

# Data Loading and Understanding the data

In [None]:
# Read the raw CSV into a DataFrame; replace the placeholder with the real file path.
df = pd.read_csv(
    'place your dataset path'
)

# Report the (rows, columns) size, then preview the first ten records.
print("Dataset Shape: {}".format(df.shape))
df.head(n=10)

# Data Cleaning and Preprocessing

## Checking for missing values

In [None]:
# Count the null entries in every column.
missing_val = df.isnull().sum()

# Print the label and the per-column counts separately: embedding the
# multi-line Series in one f-string pushed the first column onto the label
# line and misaligned the rest of the table.
print("Missing values in each column:")
print(missing_val)



## Converting categorical columns to the pandas `category` dtype

In [None]:
# Columns that are cast to pandas' 'category' dtype.
categorical_cols = [
    'ENROLLMENT',
    'GENDER',
    'TYPE OF EDUCATIONAL INSTITUTION',
    'EDUCATIONAL INSTITUTION',
    'INSTITUTION STATUS',
    'DEPARTMENT',
    'PROVINCE',
    'DISTRICT',
    'CLASSIFICATION',
    'CAMPUS',
    'FACULTY',
    'PROGRAM/MAJOR',
    'SHIFT/SCHEDULE',
    'BENEFIT DISCOUNTS',
    'STUDY MODE',
    'AGE RANGE OF ENROLLED STUDENT',
    'DISABILITY',
]

# Cast all listed columns in one call through a column -> dtype mapping.
df = df.astype({name: 'category' for name in categorical_cols})

# Print the column summary to confirm the new dtypes.
df.info()

# Exploratory Data Analysis (EDA)
Start by examining the correlations between numeric features. Our numeric subset includes the following columns:

* TUITION PAYMENT MARCH 2022
* TUITION PAYMENT MARCH 2023
* NUMBER OF ENROLLED COURSES
* AT-RISK COURSE

In [None]:
# Keep only the numeric columns; this subset feeds the correlation heatmap
# and the box plot further down.
numeric_df = df.select_dtypes(include=[np.number])

# Correlation heatmap, drawn only when at least four numeric columns exist.
if numeric_df.shape[1] >= 4:
    plt.figure(figsize=(12, 8))
    # Pairwise correlation matrix of the numeric columns.
    corr = numeric_df.corr()
    # Annotate each cell with its coefficient, shown to three decimals.
    sns.heatmap(corr, annot=True, cmap='RdBu', fmt='.3f')
    plt.title('Correlation Heatmap')
    plt.tight_layout()
    plt.show()

In [None]:
# Distribution of the 2023 tuition payment: histogram with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(
    df['TUITION PAYMENT MARCH 2023'],
    kde=True,
    color='skyblue',
    ax=ax,
)
ax.set_title('Distribution of TUITION PAYMENT MARCH 2023')
fig.tight_layout()
plt.show()

# Bar-style count plot: number of rows per GENDER value.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=df, x='GENDER', palette='pastel', ax=ax)
ax.set_title('Count Plot of GENDER')
fig.tight_layout()
plt.show()

# Box plot with one box per column of the numeric subset.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=numeric_df, palette='Set3', ax=ax)
ax.set_title('Box Plot for Numeric Features')
fig.tight_layout()
plt.show()

# Violin plot of the 2023 tuition payment split by GENDER; skipped when the
# GENDER column is absent.
if 'GENDER' in df.columns:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.violinplot(
        data=df,
        x='GENDER',
        y='TUITION PAYMENT MARCH 2023',
        palette='muted',
        ax=ax,
    )
    ax.set_title('Violin Plot of TUITION PAYMENT MARCH 2023 by GENDER')
    fig.tight_layout()
    plt.show()

# Predictive Modeling

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Column to predict.
target = 'TUITION PAYMENT MARCH 2023'

# Predictor columns fed to the model.
features = [
    'TUITION PAYMENT MARCH 2022',
    'NUMBER OF ENROLLED COURSES',
    'AT-RISK COURSE',
]

In [None]:
# Discard any row that lacks a value in a predictor column or in the target.
required_cols = [*features, target]
df_model = df.dropna(subset=required_cols)

# Split the cleaned frame into the predictor matrix and the target vector.
y = df_model[target]
X = df_model[features]

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Create the linear regression model and fit it on the training portion.
model = LinearRegression().fit(X_train, y_train)

# Generate predictions for the held-out rows.
y_pred = model.predict(X_test)

# Score the predictions against the true test targets and report both metrics.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print('R² Score:', r2)
print('Mean Squared Error:', mse)

# **YOUR FURTHER WORK**
* Experimenting with more advanced modeling techniques such as Random Forests or Gradient Boosting Machines.
* Analyzing regional trends by grouping data based on departments or districts.