In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# -----------------------
# 1. Load Dataset
# -----------------------
# Location of the census CSV, given relative to the current working
# directory; edit this if the file lives somewhere else.
csv_path = "./archive/census2011.csv"
df = pd.read_csv(filepath_or_buffer=csv_path)

# -----------------------
# 2. Clean numeric columns
# -----------------------
def _as_float(column, drop=""):
    """Return `column` converted to float, removing the characters in `drop`.

    Parameters
    ----------
    column : pd.Series
        Column to convert.
    drop : str
        Characters to delete from every value before conversion (e.g. ","
        or "%"). Each one is removed as a literal character, never as a
        regular expression.

    Returns
    -------
    pd.Series
        The column cast with ``astype(float)``.

    The stripping step is skipped when the column already has a numeric
    dtype: such a column has no ``.str`` accessor, so calling it would raise
    AttributeError, and there is nothing to strip anyway. Values that are
    still not valid numbers after stripping are not coerced; the final
    ``astype(float)`` is left to raise.
    """
    if drop and not pd.api.types.is_numeric_dtype(column):
        for char in drop:
            # regex=False keeps the match literal on every pandas version
            # (older releases default to regex=True).
            column = column.str.replace(char, "", regex=False)
    return column.astype(float)


# Population may carry "," separators and Growth a trailing "%"; the other two
# columns only need the cast to float.
df["Population"] = _as_float(df["Population"], drop=",")
df["Growth"] = _as_float(df["Growth"], drop="%")
df["Sex-Ratio"] = _as_float(df["Sex-Ratio"])
df["Literacy"] = _as_float(df["Literacy"])

# -----------------------
# 3. Select Features & Target
# -----------------------
# Three predictor columns go into X; the column to be predicted goes into y.
feature_columns = ["Population", "Growth", "Sex-Ratio"]
X = df[feature_columns]
y = df["Literacy"]

# -----------------------
# 4. Train-test split
# -----------------------
# A quarter of the rows is held out for evaluation; the fixed random_state
# makes the shuffle (and therefore the split) repeatable between runs.
split = train_test_split(X, y, random_state=42, test_size=0.25)
X_train, X_test, y_train, y_test = split

# -----------------------
# 5. Train the Linear Regression model
# -----------------------
# fit() returns the estimator itself, so construction and training can be
# written as a single expression.
model = LinearRegression().fit(X_train, y_train)

# -----------------------
# 6. Predict
# -----------------------
# Predictions for the held-out rows, used by the metrics below.
y_pred = model.predict(X_test)

# -----------------------
# 7. Metrics
# -----------------------
# Each metric compares the held-out targets with the model's predictions and
# is printed to three decimals, one per line, under a common header.
print("\n--- Model Evaluation ---")
for label, metric in (
    ("MSE  ", mean_squared_error),
    ("MAE  ", mean_absolute_error),
    ("R²   ", r2_score),
):
    print(f"{label}: {metric(y_test, y_pred):.3f}")

# -----------------------
# 8. Example prediction
# -----------------------
# One hand-written observation, keyed by the same column names the model was
# trained on, wrapped into a single-row DataFrame.
sample_row = {"Population": 11060148, "Growth": 36.01, "Sex-Ratio": 886}
sample = pd.DataFrame({name: [value] for name, value in sample_row.items()})

print("\nPredicted Literacy for sample:")
sample_prediction = model.predict(sample)
# predict() returns one value per input row; show the only one.
print(sample_prediction[0])



--- Model Evaluation ---
MSE  : 103.450
MAE  : 8.366
R²   : 0.037

Predicted Literacy for sample:
71.80820981928578
