In [6]:
!pip install -r requirements.txt



In [None]:
# ==============================================
# PISA 2015 UK ANALYSIS: PRIVATE VS PUBLIC SCHOOLS
# ==============================================
# This script analyzes:
# 1. Test score differences between private and public schools
# 2. Impact of home educational resources and family wealth
# 3. Effects of student-teacher ratio and school location

# Import essential libraries
import pandas as pd  # For data manipulation
import numpy as np   # For numerical operations
import matplotlib.pyplot as plt  # For plotting
import seaborn as sns  # For enhanced visualizations
from scipy import stats  # For t-tests and correlations
import statsmodels.api as sm  # For regression
from statsmodels.formula.api import ols  # For OLS regression
from sklearn.preprocessing import LabelEncoder  # For encoding categories
from sklearn.model_selection import train_test_split  # For splitting data
from sklearn.ensemble import RandomForestRegressor  # For feature importance
import warnings  # To suppress warnings

# Configuration for clean output and readable plots
warnings.filterwarnings('ignore')  # Suppress warnings
plt.style.use('ggplot')  # Use ggplot style to avoid seaborn style error
sns.set_palette("husl")  # Set color palette for plots
%matplotlib inline  # Ensure plots display in Jupyter

# =============================
# SECTION 1: DATA LOADING & CLEANING
# =============================
def load_and_clean_data(filepath):
    """
    Loads and cleans the PISA 2015 dataset

    Cleaning steps, in order:
      1. Drop rows that are missing any of the three test scores.
      2. Impute numerical columns with their median and categorical columns
         with their mode (computed after step 1).
      3. Cast the categorical columns to the pandas 'category' dtype.
      4. Add 'composite_score', the row-wise mean of the three test scores.
      5. Keep only the first row for each 'stuid'.

    Args:
        filepath (str): Path to CSV file
    Returns:
        DataFrame: Cleaned dataset, or None if the file cannot be read or a
        required column is missing
    """
    score_cols = ['math_score', 'read_score', 'scie_score']
    numeric_cols = ['hedres', 'wealth', 'stratio']
    categorical_cols = ['schltype', 'schllocation']

    # Load dataset
    print("\n[1.1] Loading data...")
    try:
        df = pd.read_csv(filepath)
    except FileNotFoundError:
        print(f"Error: {filepath} not found.")
        return None
    except Exception as e:
        print(f"Error loading data: {e}")
        return None
    print(f"Data shape: {df.shape}")

    # Fail with a readable message (and the documented None) instead of a
    # KeyError deep inside the cleaning steps.
    required = score_cols + numeric_cols + categorical_cols + ['stuid']
    missing = [col for col in required if col not in df.columns]
    if missing:
        print(f"Error: missing required columns: {missing}")
        return None

    # Check missing values
    print("\n[1.2] Missing values:")
    print(df[score_cols + ['schltype'] + numeric_cols + ['schllocation']].isnull().sum())

    # Handle missing values
    print("\n[1.3] Handling missing values...")
    # Drop rows with missing test scores
    df = df.dropna(subset=score_cols)

    # Collect one fill value per column and apply them in a single
    # DataFrame.fillna call. Filling through `df[col].fillna(..., inplace=True)`
    # is chained assignment and leaves `df` untouched under Copy-on-Write.
    fill_values = {}
    # Impute numerical columns with median
    for col in numeric_cols:
        median = df[col].median()
        if pd.notna(median):  # median is NaN when the whole column is empty
            fill_values[col] = median
    # Impute categorical columns with mode
    for col in categorical_cols:
        modes = df[col].mode()
        if not modes.empty:  # mode() is empty when the whole column is NaN
            fill_values[col] = modes.iloc[0]
    if fill_values:
        df = df.fillna(value=fill_values)

    # Convert data types
    print("\n[1.4] Converting data types...")
    df = df.astype({col: 'category' for col in categorical_cols})

    # Create composite score
    df['composite_score'] = df[score_cols].mean(axis=1)

    # Remove duplicates
    print("\n[1.5] Removing duplicates...")
    df = df.drop_duplicates(subset='stuid', keep='first')
    print(f"Cleaned data shape: {df.shape}")
    return df

# =============================
# SECTION 2: EXPLORATORY DATA ANALYSIS
# =============================
def perform_eda(df):
    """
    Draws the exploratory plots for the cleaned dataset

    Shows four figures in turn: histograms of the three test scores, a box
    plot of the composite score by school type, scatter plots of the composite
    score against home resources and family wealth, and finally the composite
    score against student-teacher ratio (scatter) and school location (box).

    Args:
        df (DataFrame): Cleaned dataset
    """
    target = 'composite_score'

    def scatter_panel(position, x_col, title, x_label):
        # One panel of a 1x2 grid: composite score against x_col, coloured by school type
        plt.subplot(1, 2, position)
        sns.scatterplot(data=df, x=x_col, y=target, hue='schltype')
        plt.title(title)
        plt.xlabel(x_label)

    print("\n[2.1] Exploratory Data Analysis...")

    # --- Figure 1: distribution of each test score ---
    print("\n[2.1.1] Test Score Distributions")
    score_cols = ['math_score', 'read_score', 'scie_score']
    plt.figure(figsize=(12, 4))
    for position, score_col in zip(range(1, len(score_cols) + 1), score_cols):
        plt.subplot(1, 3, position)
        sns.histplot(df[score_col], bins=20, kde=True)
        plt.title(score_col.replace('_', ' ').title())
        plt.xlabel("Score")
    plt.tight_layout()
    plt.show()

    # --- Figure 2: composite score split by school type ---
    print("\n[2.1.2] Scores by School Type")
    plt.figure(figsize=(8, 5))
    sns.boxplot(data=df, x='schltype', y=target)
    plt.title("Composite Score by School Type")
    plt.xlabel("School Type")
    plt.ylabel("Composite Score")
    plt.show()

    # --- Figure 3: composite score against home background ---
    print("\n[2.1.3] Scores by Resources and Wealth")
    plt.figure(figsize=(12, 4))
    scatter_panel(1, 'hedres', "Scores by Home Resources", "Home Resources")
    scatter_panel(2, 'wealth', "Scores by Family Wealth", "Family Wealth")
    plt.tight_layout()
    plt.show()

    # --- Figure 4: composite score against school-level factors ---
    print("\n[2.1.4] Scores by School Factors")
    plt.figure(figsize=(12, 4))
    scatter_panel(1, 'stratio', "Scores by Student-Teacher Ratio", "Student-Teacher Ratio")
    plt.subplot(1, 2, 2)
    sns.boxplot(data=df, x='schllocation', y=target)
    plt.title("Scores by School Location")
    plt.xlabel("School Location")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

# =============================
# SECTION 3: STATISTICAL TESTING
# =============================
def _print_significance(p_val, noun, indent="", alpha=0.05):
    """Print the verdict for one test, treating a NaN p-value as 'cannot assess'."""
    if np.isnan(p_val):
        # NaN < alpha is False, so without this branch an undefined test
        # would be reported as "Not significant".
        print(f"{indent}Cannot assess {noun}: test result is undefined (NaN)")
    else:
        print(f"{indent}{'Significant' if p_val < alpha else 'Not significant'} {noun}")


def perform_statistical_tests(df):
    """
    Conducts statistical tests for all hypotheses

    Prints the results of:
      1. Welch's t-test (unequal variances) of each test score between rows
         whose 'schltype' is 'Private' and rows whose 'schltype' is 'Public'.
      2. An OLS regression of the composite score on school type, home
         resources and wealth (coefficient table only).
      3. The Pearson correlation between 'stratio' and the composite score.
      4. A one-way ANOVA of the composite score across 'schllocation' values.

    A test is skipped with a message when there is too little data to run it,
    and a NaN p-value is reported as undefined rather than "Not significant".
    Significance is judged at the 0.05 level.

    Args:
        df (DataFrame): Cleaned dataset
    """
    print("\n[3.1] Statistical Tests...")

    # Hypothesis 1: Private vs. public school scores
    print("\n[3.1.1] Hypothesis 1: School Type Differences")
    is_private = df['schltype'] == 'Private'
    is_public = df['schltype'] == 'Public'
    for col in ['math_score', 'read_score', 'scie_score']:
        private = df.loc[is_private, col]
        public = df.loc[is_public, col]
        print(f"{col.replace('_', ' ').title()}:")
        # Welch's test needs a variance estimate for each group, i.e. at
        # least two observations per group; otherwise the result is NaN.
        if len(private) < 2 or len(public) < 2:
            print("  Skipped: need at least 2 students per school type "
                  f"(Private: {len(private)}, Public: {len(public)})")
            continue
        t_stat, p_val = stats.ttest_ind(private, public, equal_var=False)
        print(f"  t-statistic: {t_stat:.2f}, p-value: {p_val:.4f}")
        _print_significance(p_val, "difference", indent="  ")

    # Hypothesis 2: Controlling for resources and wealth
    print("\n[3.1.2] Hypothesis 2: Controlling for Resources and Wealth")
    formula = 'composite_score ~ C(schltype) + hedres + wealth'
    model = ols(formula, data=df).fit()
    print(model.summary().tables[1])  # Print coefficients table

    # Hypothesis 3: Student-teacher ratio effect
    print("\n[3.1.3] Hypothesis 3: Student-Teacher Ratio")
    corr, p_val = stats.pearsonr(df['stratio'], df['composite_score'])
    print(f"Correlation: r = {corr:.2f}, p-value = {p_val:.4f}")
    _print_significance(p_val, "correlation")

    # Hypothesis 4: School location effect
    print("\n[3.1.4] Hypothesis 4: School Location")
    locations = df['schllocation'].unique()
    groups = [df.loc[df['schllocation'] == loc, 'composite_score'] for loc in locations]
    if len(groups) < 2:
        # f_oneway requires at least two samples to compare.
        print(f"Skipped: ANOVA needs at least 2 school locations (found {len(groups)})")
    else:
        f_stat, p_val = stats.f_oneway(*groups)
        print(f"ANOVA: F = {f_stat:.2f}, p-value = {p_val:.4f}")
        _print_significance(p_val, "difference")

# =============================
# SECTION 4: ADVANCED MODELING
# =============================
def perform_advanced_modeling(df):
    """
    Fits a random forest on the composite score and reports feature importance

    The model is trained on 80% of the rows (fixed random seed) and its R^2 is
    printed for both the training rows and the held-out 20%, so the importance
    ranking can be read alongside how well the model actually fits.

    Side effect: adds the integer-encoded columns 'schltype_enc' and
    'schllocation_enc' to ``df`` in place.

    Args:
        df (DataFrame): Cleaned dataset
    """
    print("\n[4.1] Advanced Modeling...")

    # Encode categorical variables (one encoder per column, so each fitted
    # encoder maps exactly one column's labels)
    df['schltype_enc'] = LabelEncoder().fit_transform(df['schltype'])
    df['schllocation_enc'] = LabelEncoder().fit_transform(df['schllocation'])

    # Prepare data
    features = ['hedres', 'wealth', 'stratio', 'schltype_enc', 'schllocation_enc']
    X = df[features]
    y = df['composite_score']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Random Forest
    print("\n[4.1.1] Random Forest Feature Importance")
    rf = RandomForestRegressor(n_estimators=50, random_state=42)
    rf.fit(X_train, y_train)
    importance = pd.DataFrame({
        'Feature': features,
        'Importance': rf.feature_importances_
    }).sort_values('Importance', ascending=False)
    print(importance)

    # Use the held-out split: importances from a model that does not
    # generalize are not meaningful, so report the fit next to them.
    train_r2 = rf.score(X_train, y_train)
    test_r2 = rf.score(X_test, y_test)
    print(f"R^2 (train): {train_r2:.3f}, R^2 (test): {test_r2:.3f}")

    # Plot feature importance
    plt.figure(figsize=(8, 5))
    sns.barplot(x='Importance', y='Feature', data=importance)
    plt.title("Feature Importance")
    plt.show()

# =============================
# SECTION 5: REPORTING
# =============================
def generate_report(df):
    """
    Prints headline figures and shows a correlation heatmap

    Output consists of the total row count, the number of rows per school type
    ('Private' and 'Public'), the mean of each test score by school type
    rounded to two decimals, and a heatmap of the correlations between the
    test scores, home resources, wealth and student-teacher ratio.

    Args:
        df (DataFrame): Cleaned dataset
    """
    score_cols = ['math_score', 'read_score', 'scie_score']
    background_cols = ['hedres', 'wealth', 'stratio']

    print("\n[5.1] Summary Report...")

    # Headline counts
    print("\nTotal students:", len(df))
    for school_type in ('Private', 'Public'):
        n_students = len(df[df['schltype'] == school_type])
        print(f"{school_type} schools:", n_students)

    # Mean score per school type
    print("\nAverage Scores by School Type:")
    mean_scores = df.groupby('schltype')[score_cols].mean()
    print(mean_scores.round(2))

    # Pairwise correlations, drawn as an annotated heatmap
    print("\nCorrelation Matrix:")
    corr = df[score_cols + background_cols].corr()
    plt.figure(figsize=(6, 5))
    sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm')
    plt.title("Correlation Matrix")
    plt.show()

# =============================
# MAIN EXECUTION
# =============================
if __name__ == "__main__":
    # Entry point: clean the raw CSV, then run every analysis stage in order
    filepath = "dataset.csv"  # Update with your file path
    df = load_and_clean_data(filepath)

    if df is None:
        # Loading failed; the loader has already printed the reason
        print("\n[ERROR] Analysis failed due to data loading issues.")
    else:
        perform_eda(df)
        perform_statistical_tests(df)
        perform_advanced_modeling(df)
        generate_report(df)
        # Persist the frame as it stands after all stages have run
        df.to_csv('cleaned_data.csv', index=False)
        print("\n[COMPLETE] Analysis done! Cleaned data saved as 'cleaned_data.csv'.")

UsageError: unrecognized arguments: # Ensure plots display in Jupyter Notebook
