Assignment 5

In [25]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Step 1: Importing and Merging Data
# The three CSV files are read and inner-joined on 'customerID', so only
# customers present in all three files end up in `telecom`.
try:
    churn_data = pd.read_csv("churn_data.csv")
    customer_data = pd.read_csv("customer_data.csv")
    internet_data = pd.read_csv("internet_data.csv")

    df_1 = pd.merge(churn_data, customer_data, how='inner', on='customerID')
    telecom = pd.merge(df_1, internet_data, how='inner', on='customerID')
except Exception as e:
    # Report the problem and stop. Every later step needs `telecom`, so
    # carrying on would only fail further down with a confusing NameError
    # (or silently reuse a stale `telecom` left over in the kernel).
    print(f"Error loading data: {e}")
    raise

print("Data loaded and merged successfully.")
print(f"Dataset shape: {telecom.shape}")

# Step 2: Initial Data Inspection
# Print a preview of the merged frame, then the dtype of every column.
inspection_sections = (
    ("\n--- First 5 rows of data ---", telecom.head()),
    ("\n--- Data Types ---", telecom.dtypes),
)
for section_title, section_body in inspection_sections:
    print(section_title)
    print(section_body)

# Step 3: Data Preparation
# Keep the customer IDs for later reference. They are captured before any
# rows are dropped below, so this Series covers every merged customer.
customer_ids = telecom['customerID'].copy()

# Convert binary Yes/No columns to 1/0.
binary_vars = ['PhoneService', 'PaperlessBilling', 'Churn', 'Partner', 'Dependents']
yes_no_codes = {'Yes': 1, 'No': 0}
for var in binary_vars:
    if var not in telecom.columns:
        continue
    if pd.api.types.is_numeric_dtype(telecom[var]):
        # Already encoded (e.g. this step is being re-run on a prepared frame).
        # Mapping again would turn every value into NaN, so leave it alone.
        continue

    encoded = telecom[var].map(yes_no_codes)

    # Values other than 'Yes'/'No' become NaN in the mapping; surface them
    # instead of letting them vanish silently.
    unmapped = int((encoded.isna() & telecom[var].notna()).sum())
    if unmapped:
        print(f"Warning: {unmapped} unexpected value(s) in '{var}' were set to NaN")

    telecom[var] = encoded

# Handle 'TotalCharges' column
if 'TotalCharges' in telecom.columns:
    # Entries that cannot be parsed as numbers are coerced to NaN.
    if not pd.api.types.is_numeric_dtype(telecom['TotalCharges']):
        telecom['TotalCharges'] = pd.to_numeric(telecom['TotalCharges'], errors='coerce')

    # Drop rows with missing TotalCharges and report how many were removed.
    rows_before = len(telecom)
    telecom = telecom.dropna(subset=['TotalCharges'])
    print(f"Dropped {rows_before - len(telecom)} rows with missing TotalCharges")

# Create dummy variables for categorical columns
categorical_cols = [
    'Contract', 'PaymentMethod', 'gender', 'InternetService',
    'MultipleLines', 'OnlineSecurity', 'OnlineBackup',
    'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies'
]

# Filter to only include columns that actually exist in the dataset
categorical_cols = [col for col in categorical_cols if col in telecom.columns]

if categorical_cols:
    # One-hot encode all categorical columns at once; drop_first removes one
    # level per column so the remaining indicators are not redundant with it.
    #
    # dtype=int is deliberate: pandas 2.x produces boolean dummies by default,
    # and a frame that mixes bool with int/float columns converts to an
    # object-dtype NumPy array, which statsmodels cannot fit.
    dummies = pd.get_dummies(telecom[categorical_cols], drop_first=True, dtype=int)

    # Replace the original categorical columns with their indicator columns
    # (remaining original columns first, dummies appended after them).
    telecom = pd.concat([telecom.drop(columns=categorical_cols), dummies], axis=1)

# Remove the customerID column (the merge key) before modeling, if present.
if telecom.columns.isin(['customerID']).any():
    telecom = telecom.drop(columns=['customerID'])

# Step 4: Split data into features and target
if 'Churn' not in telecom.columns:
    # Nothing below can run without the target. Fail here with a clear message
    # rather than with a NameError on `X` at the train/test split.
    raise KeyError("'Churn' column not found in the dataset.")

# Coerce the target to numeric; anything that is not a valid number becomes NaN.
y = pd.to_numeric(telecom['Churn'], errors='coerce')

# Rows without a usable label are dropped rather than filled with 0: filling
# would invent "did not churn" outcomes and bias the model.
has_label = y.notna()
if not has_label.all():
    print(f"Dropped {int((~has_label).sum())} rows with missing Churn label")

# X and y are filtered with the same mask so they stay row-aligned.
X = telecom.loc[has_label].drop('Churn', axis=1)
y = y.loc[has_label].astype(int)

# Step 5: Split data into training and testing sets
# 70% of the rows go to training; random_state fixes the shuffle so the split
# is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=100)

# Work on independent copies. The following steps assign to columns of these
# frames, and doing that directly on the objects returned by the split can
# trigger pandas' SettingWithCopyWarning.
X_train, X_test = X_train.copy(), X_test.copy()
y_train, y_test = y_train.copy(), y_test.copy()

# Step 6: Feature Scaling (only for numeric columns)
numeric_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()
scale_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']

# Scaling is applied only when all three continuous columns are numeric.
if all(name in numeric_cols for name in scale_cols):
    scaler = StandardScaler()

    # Fit the scaler on the training rows and standardise them ...
    X_train[scale_cols] = scaler.fit_transform(X_train[scale_cols])

    # ... then reuse the fitted scaler, unchanged, for the test rows.
    X_test[scale_cols] = scaler.transform(X_test[scale_cols])

# Step 7: Final Data Preparation for Modeling
# Coerce every feature column to numeric; unparseable entries become NaN.
# `apply` builds new frames, so nothing is assigned into a possibly shared slice.
X_train = X_train.apply(pd.to_numeric, errors='coerce')
X_test = X_test.apply(pd.to_numeric, errors='coerce')

# Impute remaining NaN values with the TRAINING column means in both sets.
# Using the test set's own means would leak test statistics into the
# evaluation and impute it differently from the data the model was fitted on.
train_means = X_train.mean()
X_train = X_train.fillna(train_means)
X_test = X_test.fillna(train_means)

# Reset indices so features and targets line up positionally.
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# Step 8: Check data readiness
# Collect the shapes, NaN flags and dtype counts of the training data, then
# print them as one report (one entry per line).
readiness_report = [
    "\n--- Data Preparation Summary ---",
    f"X_train shape: {X_train.shape}",
    f"y_train shape: {y_train.shape}",
    f"X_train has NaN values: {X_train.isna().any().any()}",
    f"y_train has NaN values: {y_train.isna().any()}",
    f"X_train dtypes: {X_train.dtypes.value_counts()}",
]
print("\n".join(readiness_report))

# Step 9: Convert to NumPy arrays for model fitting
# Request float64 explicitly: a frame that mixes bool dummy columns with
# int/float columns would otherwise convert to an object-dtype array, which
# statsmodels cannot fit. Any non-numeric leftovers fail here, loudly.
X_train_np = X_train.to_numpy(dtype=np.float64)
y_train_np = y_train.to_numpy()

print(f"X_train_np dtype: {X_train_np.dtype}")
print(f"y_train_np dtype: {y_train_np.dtype}")

# Step 10: Build the model
# Three logistic-regression implementations are tried in order, stopping at
# the first that succeeds: statsmodels GLM (binomial family), statsmodels
# Logit, then scikit-learn's LogisticRegression. If all three fail, the last
# error is re-raised so later cells do not run against a missing model.
print("\n--- Model Building ---")

# Add constant (intercept term)
X_train_const = sm.add_constant(X_train_np)

# Label coefficients in the statsmodels summaries with the real column names
# (plain arrays are otherwise reported as x1, x2, ...). The names are used
# only when their count matches the design matrix, i.e. when the intercept
# column really was added; otherwise statsmodels' defaults are kept.
coef_names = ['const'] + [str(col) for col in X_train.columns]
if len(coef_names) != X_train_const.shape[1]:
    coef_names = None

model_built = False

# Method 1: Try statsmodels GLM
try:
    print("Attempting statsmodels GLM...")

    # Force data to be float64
    X_train_const = X_train_const.astype(np.float64)
    y_train_np = y_train_np.astype(np.float64)

    logm1 = sm.GLM(y_train_np, X_train_const, family=sm.families.Binomial())
    result = logm1.fit()

    print("GLM model built successfully!")
    print("\n--- Model Summary ---")
    print(result.summary(xname=coef_names))
    model_built = True
except Exception as e:
    print(f"Error with statsmodels GLM: {e}")

# Method 2: Try statsmodels Logit as an alternative
if not model_built:
    try:
        print("\nAttempting statsmodels Logit...")
        logit_model = sm.Logit(y_train_np, X_train_const)
        result = logit_model.fit()

        print("Logit model built successfully!")
        print("\n--- Model Summary ---")
        print(result.summary(xname=coef_names))
        model_built = True
    except Exception as e2:
        print(f"Error with statsmodels Logit: {e2}")

# Method 3: Fallback to sklearn's LogisticRegression
if not model_built:
    try:
        print("\nFalling back to sklearn's LogisticRegression...")
        lr = LogisticRegression(max_iter=1000, random_state=100)
        lr.fit(X_train_np, y_train_np)

        print("sklearn LogisticRegression model built successfully!")

        # One coefficient per feature column.
        feature_importance = pd.DataFrame({
            'Feature': X_train.columns,
            'Coefficient': lr.coef_[0]
        })

        # Rank by coefficient magnitude: a large negative coefficient is as
        # influential as a large positive one, and sorting by signed value
        # would leave the strongest negative effects out of the top 10.
        by_magnitude = feature_importance['Coefficient'].abs().sort_values(ascending=False).index
        print("\n--- Top 10 Important Features ---")
        print(feature_importance.loc[by_magnitude].head(10))

        # Calculate and print model accuracy
        train_accuracy = lr.score(X_train_np, y_train_np)
        test_accuracy = lr.score(X_test.to_numpy(), y_test.to_numpy())

        print(f"\nTraining accuracy: {train_accuracy:.4f}")
        print(f"Testing accuracy: {test_accuracy:.4f}")
        model_built = True
    except Exception as e3:
        print(f"Error with sklearn LogisticRegression: {e3}")
        print("All modeling approaches failed. Please check your data preprocessing steps.")
        # Nothing was fitted; stop here instead of continuing without a model.
        raise

Data loaded and merged successfully.
Dataset shape: (7043, 21)

--- First 5 rows of data ---
   customerID  tenure PhoneService        Contract PaperlessBilling  \
0  7590-VHVEG       1           No  Month-to-month              Yes   
1  5575-GNVDE      34          Yes        One year               No   
2  3668-QPYBK       2          Yes  Month-to-month              Yes   
3  7795-CFOCW      45           No        One year               No   
4  9237-HQITU       2          Yes  Month-to-month              Yes   

               PaymentMethod  MonthlyCharges TotalCharges Churn  gender  ...  \
0           Electronic check           29.85        29.85    No  Female  ...   
1               Mailed check           56.95       1889.5    No    Male  ...   
2               Mailed check           53.85       108.15   Yes    Male  ...   
3  Bank transfer (automatic)           42.30      1840.75    No    Male  ...   
4           Electronic check           70.70       151.65   Yes  Female  ...   
