In [None]:
import pandas as pd
import numpy as np
import os
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from scipy.stats.mstats import winsorize
from imblearn.over_sampling import SMOTE

In [None]:
# Location of the augmented dataset. Set the CHURN_DATA_PATH environment variable
# to run the notebook on another machine; when it is unset the original
# author-local path is used, so existing behaviour is unchanged.
DATA_PATH = os.environ.get(
    'CHURN_DATA_PATH',
    '/home/shadowfiend/Documents/USTH/Year_3/Internship/DeffendThesis/Defend-Thesis/Data/Data_UF_augmented.csv',
)
# The CSV is decoded as latin1; df.info() prints the column/dtype overview.
df = pd.read_csv(DATA_PATH, encoding='latin1')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28623 entries, 0 to 28622
Data columns (total 31 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Contract_ID                26593 non-null  object 
 1   Gender                     28623 non-null  int64  
 2   Age                        28623 non-null  int64  
 3   Region                     26593 non-null  object 
 4   Managing Branch            26593 non-null  object 
 5   Sales Unit                 9272 non-null   object 
 6   Selling branch             24832 non-null  object 
 7   Sales Region               24832 non-null  object 
 8   Creation Date              28623 non-null  object 
 9   OrderID/RegCode            21282 non-null  object 
 10  Transaction Code           1255 non-null   object 
 11  Type                       20462 non-null  object 
 12  FG Package Command         23622 non-null  object 
 13  Payment Month              28623 non-null  int

In [None]:
# 2. Create the Churn label and drop the leaking column
# pop() removes 'Cancellation Date' from df and returns its values in one step,
# so the column that directly encodes the outcome is no longer available as a feature.
cancellation_dates = df.pop('Cancellation Date')
# Churn = 1 when a cancellation date is present, 0 otherwise.
df['Churn'] = cancellation_dates.notna().astype(int)

In [None]:
# 3. Select exactly the variables of interest
features = [
    'Customer Satisfaction', 'Complaints Count',
    'Support Call Count', 'Ping Issue Count',
    'Avg Download Speed (Mbps)', 'Avg Upload Speed (Mbps)',
    'Promotion Used', 'Data Usage (GB)',
]
# Target vector, taken from the Churn column of df.
y = df['Churn']
# Independent copy of the predictors so later edits do not write back into df.
X_raw = df.loc[:, features].copy()


In [None]:
# 3. Handle missing and encode
# Missing promotion values become their own category, 'None'.
X_raw['Promotion Used'] = X_raw['Promotion Used'].fillna('None')

# Numeric gaps are filled with the per-column median.
# NOTE(review): the medians are computed on the full dataset, i.e. before the
# train/val/test split performed further down.
numeric_medians = X_raw.select_dtypes(include=['number']).median()
for col_name, col_median in numeric_medians.items():
    X_raw[col_name] = X_raw[col_name].fillna(col_median)

# One-hot encode the promotion column; drop_first removes one level as the reference.
X = pd.get_dummies(X_raw, columns=['Promotion Used'], drop_first=True)


In [None]:
# 4. Outlier treatment: log-transform & winsorize
skewed_cols = ['Complaints Count', 'Support Call Count', 'Ping Issue Count', 'Data Usage (GB)']
for name in skewed_cols:
    # log1p first, then clip the lowest and highest 1% of the transformed values.
    # NOTE(review): the limits are derived from the full dataset, before the split below.
    logged = np.log1p(X[name])
    X[name] = winsorize(logged, limits=[0.01, 0.01])

In [None]:
# 5. Split into train/val/test (60/20/20)
# Step 1: hold out 20% of all rows as the test set, stratified on the label.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
# Step 2: 25% of the remaining 80% (= 20% of all rows) becomes the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp,
    test_size=0.25,
    random_state=42,
    stratify=y_temp,
)

In [None]:
# 6. Scale numeric features
# The frames returned by train_test_split are slices of X; take explicit copies
# before assigning columns so pandas does not raise SettingWithCopyWarning and
# the writes unambiguously target these three frames only.
X_train, X_val, X_test = X_train.copy(), X_val.copy(), X_test.copy()

# Only float64/int64 columns are standardized; other dtypes are left as they are.
numeric_cols = X_train.select_dtypes(include=['float64','int64']).columns.tolist()

# Fit on the training split only, then apply the same mean/std to val and test.
scaler = StandardScaler()
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_val[numeric_cols]   = scaler.transform(X_val[numeric_cols])
X_test[numeric_cols]  = scaler.transform(X_test[numeric_cols])



In [None]:
# 7. VIF elimination to remove multicollinearity
def calculate_vif(df):
    """Return one variance-inflation-factor row per column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Predictor matrix; every column must be castable to float.

    Returns
    -------
    pandas.DataFrame
        Columns ``'variable'`` (the column label) and ``'VIF'`` (the value
        returned by ``variance_inflation_factor`` for that column position),
        in the same order as ``df.columns``.
    """
    design = df.astype(float).values
    records = []
    for position, column in enumerate(df.columns):
        records.append({
            'variable': column,
            'VIF': variance_inflation_factor(design, position),
        })
    return pd.DataFrame(records)

def vif_eliminate(df, thresh=5.0):
    """Iteratively drop the column with the highest VIF until none exceeds ``thresh``.

    Parameters
    ----------
    df : pandas.DataFrame
        Predictor matrix; it is copied and cast to float, the input is not modified.
    thresh : float, default 5.0
        A column is a removal candidate while its VIF is strictly greater than this.

    Returns
    -------
    list
        Labels of the columns that remain, in their original order.

    Notes
    -----
    One column is removed per iteration and VIFs are recomputed afterwards.
    Each removal is reported with ``print``.
    """
    X_mod = df.copy().astype(float)
    while True:
        vif_df = calculate_vif(X_mod)
        high = vif_df[vif_df['VIF'] > thresh]
        if high.empty:
            break
        # Take the worst offender once and use that same row for both the drop
        # and the log message, so the reported VIF belongs to the dropped column
        # (previously the message showed the VIF of the first unsorted row).
        worst = high.sort_values('VIF', ascending=False).iloc[0]
        drop_var = worst['variable']
        X_mod = X_mod.drop(columns=[drop_var])
        print(f"Dropped '{drop_var}' (VIF={worst['VIF']:.2f})")
    return X_mod.columns.tolist()

# Decide which columns survive using the training split only, then restrict
# all three splits to that same set and cast them to float.
selected_cols = vif_eliminate(X_train)
X_train, X_val, X_test = (
    split[selected_cols].astype(float)
    for split in (X_train, X_val, X_test)
)

In [None]:
# Baseline: oversample the training split with SMOTE (seeded for reproducibility)
# and fit a plain logistic regression on the resampled data.
# Validation and test splits are not resampled.
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_train, y_train)
# NOTE(review): in the cells shown here, `model` is not used afterwards; the
# later threshold tuning and risk scoring rely on the grid-search estimator `best`.
model = LogisticRegression(max_iter=1000)
model.fit(X_res, y_res)

In [None]:
# 8. Hyperparameter tuning: find best C via GridSearchCV with class_weight
# Candidate values for C, the inverse regularization strength.
param_grid = {'C': [0.01, 0.1, 1, 10, 100]}
# L2-penalized logistic regression; class_weight='balanced' handles the class
# imbalance here, and the model is fitted on the original (non-SMOTE) training split.
lr = LogisticRegression(
    penalty='l2',
    solver='liblinear',
    class_weight='balanced',
    max_iter=1000,
)
# 5-fold cross-validation, selecting the C with the best F1 score.
grid = GridSearchCV(estimator=lr, param_grid=param_grid, scoring='f1', cv=5).fit(X_train, y_train)
best = grid.best_estimator_


In [None]:
# 9. Threshold tuning on validation set
# Predicted probability of the positive class (column 1) on the validation split.
probs_val = best.predict_proba(X_val)[:, 1]
# Candidate cut-offs: 81 evenly spaced values from 0.1 to 0.9 (step 0.01).
ths = np.linspace(0.1, 0.9, 81)
f1s = []
for cutoff in ths:
    predicted = (probs_val >= cutoff).astype(int)
    f1s.append(f1_score(y_val, predicted))
# Keep the cut-off with the highest validation F1 (first one on ties).
best_t = ths[np.argmax(f1s)]

In [None]:
# Fit a statsmodels logit on the training split to obtain coefficients and p-values.
X_sm = sm.add_constant(X_train)
logit = sm.Logit(y_train, X_sm).fit(disp=False)

# The intercept is excluded from the per-variable report.
coef = logit.params.drop('const')
pvals = logit.pvalues.drop('const')
odds = np.exp(coef)

# VIF of every remaining predictor, computed on the training matrix.
vif_records = []
for position, column in enumerate(X_train.columns):
    vif_records.append({
        'Variable': column,
        'VIF'     : variance_inflation_factor(X_train.values, position),
    })
vif_all = pd.DataFrame(vif_records)

# One row per predictor: coefficient, p-value, odds ratio and VIF.
coef_table = pd.DataFrame({
    'Variable'   : coef.index,
    'Coefficient': coef.values,
    'p_value'    : pvals.values,
    'Odds_Ratio' : odds.values
})
full = coef_table.merge(vif_all, on='Variable')
full.to_csv('Model_Summary_full.csv', index=False)

# Filtered report: keep predictors with p-value <= 0.05 and VIF <= 5.
is_significant = full.p_value <= 0.05
is_low_vif = full.VIF <= 5
filtered = full[is_significant & is_low_vif]
filtered.to_csv('Model_Summary_filtered.csv', index=False)

In [None]:
# --- 3. Build Risk_Scores.csv ---
ids    = df.index  # or replace with df['CustomerID']
# Probability of the positive class on the test split, and the hard label at
# the validation-tuned threshold best_t.
probs_t= best.predict_proba(X_test)[:,1]
labels = (probs_t>=best_t).astype(int)
def grp(p):
    """Map a predicted probability to a risk group.

    'High' for p >= 0.5, otherwise 'Medium' for p >= best_t, otherwise 'Low'.
    NOTE(review): when best_t is above 0.5 the 'Medium' group can never occur,
    and probabilities in [0.5, best_t) are 'High' while Pred_Label is 0.
    """
    return 'High' if p>=0.5 else ('Medium' if p>=best_t else 'Low')

# X_test keeps the row labels of df, so its index identifies each scored row.
# It is inserted explicitly as the first column; previously the CustomerID
# column was created and then discarded by the [selected_cols] selection.
risk = X_test[selected_cols].copy()
risk.insert(0, 'CustomerID', X_test.index)
risk = risk.reset_index(drop=True)
# X_test was already standardized in the scaling step, so the features are
# exported as the model saw them. The former second scaler.transform call
# double-scaled them, and would fail if a numeric column had been dropped by
# VIF elimination.
risk['Pred_Probability'] = probs_t
risk['Pred_Label']       = labels
risk['Risk_Group']       = [grp(p) for p in probs_t]
risk.to_csv('Risk_Scores.csv', index=False)

print("Exported Model_Summary_full.csv, Model_Summary_filtered.csv, and Risk_Scores.csv")

Exported Model_Summary_full.csv, Model_Summary_filtered.csv, and Risk_Scores.csv
