### Linear Regression
Linear regression uses the relationship between the data points to draw a best-fit straight line through them.

This line can be used to predict future values.



In [10]:
import pandas as pd



# Read the enrollment CSV from the local datasets folder into a DataFrame
df = pd.read_csv(filepath_or_buffer="datasets/pupil-enrollment-2010-2015.csv")



In [13]:
from sklearn.preprocessing import StandardScaler

# Select the single predictor column (kept 2-D as a DataFrame) and the target column (a Series)
X = df[['Total P1 Boys Enrollment']]  # Independent variable(s)
y = df['Total P1 Girls Enrollment']  # Dependent variable

# Report the dimensions before any cleaning is applied
print("Shape of X before:", X.shape)
print("Shape of y before:", y.shape)

# Place predictor and target side by side so a NaN in either column drops the whole row,
# keeping X and y aligned on the same surviving rows
cleaned_data = pd.concat([X, y], axis="columns").dropna(how="any")
X, y = cleaned_data[['Total P1 Boys Enrollment']], cleaned_data['Total P1 Girls Enrollment']

# Standardize the predictor: fit the scaler on the cleaned X, then transform that same X
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Report the dimensions of the scaled predictor (its row count reflects the NaN removal above)
print("Shape of X after:", X_scaled.shape)


Shape of X before: (11702, 1)
Shape of y before: (11702,)
Shape of X after: (11682, 1)


In [14]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
import numpy as np


# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a linear regression model on the training portion (fit returns the estimator itself)
model = LinearRegression().fit(X_train, y_train)

# Predict the target for the held-out rows and show the predictions
y_pred = model.predict(X_test)
print(y_pred)

# Coefficient of determination on the held-out rows
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print("R-squared score:", r2)

# Root mean squared error: square root of the mean squared error on the held-out rows
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)
print("Root Mean Squared Error (RMSE):", rmse)

[57.266874   68.11762661 30.55732911 ... 44.74677483 25.54928945
 38.06938861]
R-squared score: 0.8045057295731027
Root Mean Squared Error (RMSE): 20.177570268214904
