<a href="https://colab.research.google.com/github/SindhuKamakshi/customer-renewal-prediction/blob/main/saas_churn_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**1.PROBLEM STATEMENT:**
  The goal of this project is to predict whether a SaaS customer will renew or cancel their subscription using machine learning.

In [3]:
import zipfile

with zipfile.ZipFile('saas-churn.zip', 'r') as zip_ref:
    zip_ref.extractall()

**2.DATASET DESCRIPTION:**             
**Source:** Kaggle SaaS churn dataset
,Number of records
,**Target column:** churn_flag

In [4]:
import pandas as pd

df = pd.read_csv('ravenstack_subscriptions.csv')
df.head()

Unnamed: 0,subscription_id,account_id,start_date,end_date,plan_tier,seats,mrr_amount,arr_amount,is_trial,upgrade_flag,downgrade_flag,churn_flag,billing_frequency,auto_renew_flag
0,S-8cec59,A-3c1a3f,2023-12-23,2024-04-12,Enterprise,14,2786,33432,False,False,False,True,monthly,True
1,S-0f6f44,A-9b9fe9,2024-06-11,,Pro,17,833,9996,False,False,False,False,monthly,True
2,S-51c0d1,A-659280,2024-11-25,,Enterprise,62,0,0,True,True,False,False,annual,False
3,S-f81687,A-e7a1e2,2024-11-23,2024-12-13,Enterprise,5,995,11940,False,False,False,True,monthly,True
4,S-cff5a2,A-ba6516,2024-01-10,,Enterprise,27,5373,64476,False,False,False,False,monthly,True


In [5]:
df.shape

(5000, 14)

**3.DATA PREPROCESING:**        
Checked missing values     
Handled categorical features     
Converted data types



In [6]:
df.isnull().sum()

Unnamed: 0,0
subscription_id,0
account_id,0
start_date,0
end_date,4514
plan_tier,0
seats,0
mrr_amount,0
arr_amount,0
is_trial,0
upgrade_flag,0


In [7]:
df['churn_flag'].value_counts()

Unnamed: 0_level_0,count
churn_flag,Unnamed: 1_level_1
False,4514
True,486


In [8]:
df_model = df.drop(columns=['subscription_id', 'account_id', 'start_date', 'end_date'])

df_model.head()

Unnamed: 0,plan_tier,seats,mrr_amount,arr_amount,is_trial,upgrade_flag,downgrade_flag,churn_flag,billing_frequency,auto_renew_flag
0,Enterprise,14,2786,33432,False,False,False,True,monthly,True
1,Pro,17,833,9996,False,False,False,False,monthly,True
2,Enterprise,62,0,0,True,True,False,False,annual,False
3,Enterprise,5,995,11940,False,False,False,True,monthly,True
4,Enterprise,27,5373,64476,False,False,False,False,monthly,True


In [9]:
X = df_model.drop('churn_flag', axis=1)
y = df_model['churn_flag']

X.head(), y.head()

(    plan_tier  seats  mrr_amount  arr_amount  is_trial  upgrade_flag  \
 0  Enterprise     14        2786       33432     False         False   
 1         Pro     17         833        9996     False         False   
 2  Enterprise     62           0           0      True          True   
 3  Enterprise      5         995       11940     False         False   
 4  Enterprise     27        5373       64476     False         False   
 
    downgrade_flag billing_frequency  auto_renew_flag  
 0           False           monthly             True  
 1           False           monthly             True  
 2           False            annual            False  
 3           False           monthly             True  
 4           False           monthly             True  ,
 0     True
 1    False
 2    False
 3     True
 4    False
 Name: churn_flag, dtype: bool)

In [10]:
X.isnull().sum()

Unnamed: 0,0
plan_tier,0
seats,0
mrr_amount,0
arr_amount,0
is_trial,0
upgrade_flag,0
downgrade_flag,0
billing_frequency,0
auto_renew_flag,0


In [11]:
X.dtypes

Unnamed: 0,0
plan_tier,object
seats,int64
mrr_amount,int64
arr_amount,int64
is_trial,bool
upgrade_flag,bool
downgrade_flag,bool
billing_frequency,object
auto_renew_flag,bool


**4.FEATURE ENCODING:**  
Used one-hot encoding for plan tier & billing frequency

In [12]:
X_encoded = pd.get_dummies(
    X,
    columns=['plan_tier', 'billing_frequency'],
    drop_first=True
)

In [13]:
X_encoded.head()

Unnamed: 0,seats,mrr_amount,arr_amount,is_trial,upgrade_flag,downgrade_flag,auto_renew_flag,plan_tier_Enterprise,plan_tier_Pro,billing_frequency_monthly
0,14,2786,33432,False,False,False,True,True,False,True
1,17,833,9996,False,False,False,True,False,True,True
2,62,0,0,True,True,False,False,True,False,False
3,5,995,11940,False,False,False,True,True,False,True
4,27,5373,64476,False,False,False,True,True,False,True


In [14]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42, stratify=y
)

In [15]:
y_train.value_counts()

Unnamed: 0_level_0,count
churn_flag,Unnamed: 1_level_1
False,3611
True,389


**5.MODEL TRAINING:**  
Logistic Regression
Class weight balanced

In [16]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

In [17]:
y_pred = model.predict(X_test)

In [18]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.90      1.00      0.95       903
        True       0.00      0.00      0.00        97

    accuracy                           0.90      1000
   macro avg       0.45      0.50      0.47      1000
weighted avg       0.82      0.90      0.86      1000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [19]:
from sklearn.linear_model import LogisticRegression

model_balanced = LogisticRegression(
    max_iter=1000,
    class_weight='balanced'
)

model_balanced.fit(X_train, y_train)

In [20]:
y_pred_balanced = model_balanced.predict(X_test)

**6.MODEL EVALUATION:**  
Accuracy  
Precision  
Recall  


In [21]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred_balanced))

              precision    recall  f1-score   support

       False       0.89      0.52      0.65       903
        True       0.08      0.40      0.14        97

    accuracy                           0.51      1000
   macro avg       0.49      0.46      0.40      1000
weighted avg       0.81      0.51      0.60      1000



In [24]:
feature_columns = X_encoded.columns

**7.NEW CUSTOMER PREDICTION:**  
Model takes new customer details
Outputs renewal/cancellation + probability

In [26]:
import pandas as pd

def predict_customer_renewal(customer_data, model, feature_columns):
    # Convert input to DataFrame
    df = pd.DataFrame([customer_data])

    # One-hot encode categorical features
    df_encoded = pd.get_dummies(
        df,
        columns=['plan_tier', 'billing_frequency'],
        drop_first=True
    )

    # Align columns with training data
    df_encoded = df_encoded.reindex(columns=feature_columns, fill_value=0)

    # Predict
    prediction = model.predict(df_encoded)[0]
    probability = model.predict_proba(df_encoded)[0]

    if prediction == 1:
        result = "CANCEL"
    else:
        result = "RENEW"

    return {
        "Prediction": result,
        "Renew_Probability": round(probability[0], 2),
        "Cancel_Probability": round(probability[1], 2)
    }
    new_customer = {
    "seats": 12,
    "mrr_amount": 1500,
    "arr_amount": 18000,
    "is_trial": False,
    "upgrade_flag": False,
    "downgrade_flag": False,
    "auto_renew_flag": True,
    "plan_tier": "Pro",
    "billing_frequency": "monthly"
}
output = predict_customer_renewal(new_customer, model, feature_columns)
print(output)

{'Prediction': 'RENEW', 'Renew_Probability': np.float64(0.89), 'Cancel_Probability': np.float64(0.11)}
