In [1]:
!pip install pandas numpy scipy statsmodels matplotlib seaborn



In [3]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from scipy.optimize import minimize
from scipy.stats import zscore

In [7]:
# Fix the global NumPy seed so the synthetic sample is identical on every run.
np.random.seed(42)

# Draw 100 observations per predictor from normal distributions:
# study time with mean 4 and sd 3, sleep with mean 8 and sd 3.5.
hours_studied = np.random.normal(loc=4, scale=3, size=100)
sleep_hours = np.random.normal(loc=8, scale=3.5, size=100)

In [9]:
# Simulated response: a linear combination of the two predictors plus
# Gaussian noise with mean 0 and standard deviation 5.
exam_score = (
    10 * hours_studied
    + 2 * sleep_hours
    + np.random.normal(loc=0, scale=5, size=100)
)

# Collect predictors and response in one frame, one column per variable.
df = pd.DataFrame(
    dict(
        hours_studied=hours_studied,
        sleep_hours=sleep_hours,
        exam_score=exam_score,
    )
)

In [11]:
# Count the missing entries in each column.
missing = df.isna().sum()

In [13]:
# Flag every row whose absolute z-score exceeds the threshold in any column.
# nan_policy="omit" stops a single missing value from turning its whole
# column of z-scores into NaN: with the default "propagate" every comparison
# below would then be False and real outliers would go unreported.
# With no missing values the result is identical to the default policy.
OUTLIER_Z_THRESHOLD = 3
z_scores = np.abs(zscore(df, nan_policy="omit"))
outliers = (z_scores > OUTLIER_Z_THRESHOLD).any(axis=1)

# Rows flagged above, kept separately so they can be inspected or reported.
outlier_report = df[outliers]

In [15]:
# Keep only the rows that were not flagged as outliers.
cleaned_df = df.loc[~outliers]

In [17]:
# Persist the outlier-free data (without the index column) for later reuse.
cleaned_df.to_csv(path_or_buf="cleaned_data.csv", index=False)

# Report the data-quality checks computed in the earlier cells.
print("Missing values:\n", missing)
print("\nOutliers:\n", outlier_report)

Missing values:
 hours_studied    0
sleep_hours      0
exam_score       0
dtype: int64

Outliers:
 Empty DataFrame
Columns: [hours_studied, sleep_hours, exam_score]
Index: []


In [19]:
# Pairwise Pearson correlation between the two predictors.
correlation_matrix = cleaned_df.loc[:, ['hours_studied', 'sleep_hours']].corr(method='pearson')
print("Correlation Matrix:\n", correlation_matrix)

Correlation Matrix:
                hours_studied  sleep_hours
hours_studied       1.000000    -0.136422
sleep_hours        -0.136422     1.000000


In [21]:
# Select the two predictors and rescale each column to zero mean and unit
# standard deviation (pandas' default sample standard deviation).
features = cleaned_df[['hours_studied', 'sleep_hours']]
standardized_features = (
    features
    .sub(features.mean(), axis='columns')
    .div(features.std(), axis='columns')
)

In [23]:
# Design matrix: a leading column of ones (the intercept term) followed by
# the standardized predictors.
X = np.column_stack((np.ones(len(standardized_features)), standardized_features))

# Response vector as a plain NumPy array.
y = cleaned_df['exam_score'].to_numpy()

In [25]:
def mse(beta, design=None, target=None):
    """Return the mean squared error of the linear predictions ``design @ beta``.

    Parameters
    ----------
    beta : array-like
        Coefficient vector, one entry per column of the design matrix.
    design : array-like, optional
        Design matrix. When omitted, the notebook-level ``X`` is used, so the
        one-argument call made by ``minimize`` behaves exactly as before.
    target : array-like, optional
        Observed responses. When omitted, the notebook-level ``y`` is used.

    Returns
    -------
    float
        Mean of the squared residuals ``target - design @ beta``.
    """
    # Fall back to the notebook globals only when no explicit data is given;
    # passing data explicitly makes the function usable on any dataset.
    design = np.asarray(X if design is None else design)
    target = np.asarray(y if target is None else target)
    residuals = target - design @ np.asarray(beta)
    return np.mean(residuals ** 2)

In [27]:
# Start the optimizer from the all-zero coefficient vector, one entry per
# column of the design matrix.
initial_beta = np.zeros(shape=X.shape[1], dtype=float)

In [None]:
# Fit the coefficients numerically by minimizing the MSE from the zero start.
result = minimize(mse, initial_beta)

# minimize() reports failure through the result object rather than raising,
# so surface a non-converged fit instead of silently using its coefficients.
if not result.success:
    warnings.warn(f"MSE minimization did not converge: {result.message}")

beta_opt = result.x