In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score



In [2]:

# Load the dataset
df = pd.read_csv('your_dataset_with_churn_label.csv')

In [22]:
df.head(5)

Unnamed: 0,CUST_ID,BALANCE,BALANCE_FREQUENCY,PURCHASES,ONEOFF_PURCHASES,INSTALLMENTS_PURCHASES,CASH_ADVANCE,PURCHASES_FREQUENCY,ONEOFF_PURCHASES_FREQUENCY,PURCHASES_INSTALLMENTS_FREQUENCY,CASH_ADVANCE_FREQUENCY,CASH_ADVANCE_TRX,PURCHASES_TRX,CREDIT_LIMIT,PAYMENTS,MINIMUM_PAYMENTS,PRC_FULL_PAYMENT,TENURE,CHURN
0,0,40.900749,0.818182,95.4,0.0,95.4,0.0,0.166667,0.0,0.083333,0.0,0,2,1000.0,201.802084,139.509787,0.0,12,False
1,1,3202.467416,0.909091,0.0,0.0,0.0,6442.945483,0.0,0.0,0.0,0.25,4,0,7000.0,4103.032597,1072.340217,0.222222,12,False
2,2,2495.148862,1.0,773.17,773.17,0.0,0.0,1.0,1.0,0.0,0.0,0,12,7500.0,622.066742,627.284787,0.0,12,False
3,3,1666.670542,0.636364,1499.0,1499.0,0.0,205.788017,0.083333,0.083333,0.0,0.083333,1,1,7500.0,0.0,,0.0,12,False
4,4,817.714335,1.0,16.0,16.0,0.0,0.0,0.083333,0.083333,0.0,0.0,0,1,1200.0,678.334763,244.791237,0.0,12,False


In [4]:
# Assuming we have a 'CHURN' column for this example, but if not, we'll create a dummy churn based on random conditions
# Here we're creating a dummy 'CHURN' column based on 'BALANCE' and 'PURCHASES'
df['CHURN'] = (df['BALANCE'] > 5000) & (df['PURCHASES'] < 1000)  # Example condition for churn

In [5]:
# Convert 'Customer_ID' to numeric if it exists
if 'CUST_ID' in df.columns:
    le = LabelEncoder()
    df['CUST_ID'] = le.fit_transform(df['CUST_ID'])


In [6]:
# Convert other categorical columns to numeric using one-hot encoding if needed
df = pd.get_dummies(df)


In [7]:
# Separate the features and the target variable
X = df.drop('CHURN', axis=1)  # Features
y = df['CHURN']               # Target variable

In [8]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [9]:
# Initialize the Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

In [10]:
# Train the model
rf_classifier.fit(X_train, y_train)

In [11]:

# Make predictions on the test set
y_pred = rf_classifier.predict(X_test)


In [12]:
# Calculate accuracy parameters
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

In [13]:
# Print the accuracy parameters
print("Accuracy: {:.2f}%".format(accuracy * 100))
print("Precision: {:.2f}%".format(precision * 100))
print("Recall: {:.2f}%".format(recall * 100))
print("F1 Score: {:.2f}%".format(f1 * 100))

Accuracy: 99.89%
Precision: 100.00%
Recall: 97.50%
F1 Score: 98.73%


In [14]:
# Print the classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

       False       1.00      1.00      1.00      2565
        True       1.00      0.97      0.99       120

    accuracy                           1.00      2685
   macro avg       1.00      0.99      0.99      2685
weighted avg       1.00      1.00      1.00      2685



In [15]:
# Print the confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))



Confusion Matrix:
[[2565    0]
 [   3  117]]


In [16]:
# Identify customers who are at risk of churn
at_risk_customers = df[df['CHURN'] == 1]


In [17]:

# Create a list of hypothetical purchase locations
locations = ['Supermarket', 'Gas Station', 'Online Shopping', 'Restaurant', 'Pharmacy', 'Mall']

# Randomly assign a location to each row in the DataFrame
np.random.seed(42)  # For reproducibility
at_risk_customers.loc[:, 'PURCHASES_LOCATION'] = np.random.choice(locations, size=len(at_risk_customers))

# # Display the first few rows to verify
# print(df[['CUST_ID', 'PURCHASES', 'PURCHASES_LOCATION']].head())


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at_risk_customers.loc[:, 'PURCHASES_LOCATION'] = np.random.choice(locations, size=len(at_risk_customers))


In [18]:
# Check if 'PURCHASES_LOCATION' exists in the DataFrame
if 'PURCHASES_LOCATION' in at_risk_customers.columns:
    # Analyze spending patterns of at-risk customers including 'PURCHASES_LOCATION'
    top_spenders = at_risk_customers[['CUST_ID', 'BALANCE', 'PURCHASES', 'PURCHASES_LOCATION']].sort_values(by='PURCHASES', ascending=False)
    print(top_spenders)

 #Reward system suggestion based on frequent spending locations
    def suggest_rewards(row):
        if row['PURCHASES_LOCATION'] == 'Supermarket':
            return "Offer discount on groceries"
        elif row['PURCHASES_LOCATION'] == 'Gas Station':
            return "Provide fuel cashback reward"
        elif row['PURCHASES_LOCATION'] == 'Online Shopping':
            return "Offer free shipping on next order"
        elif row['PURCHASES_LOCATION'] == 'Restaurant':
            return "Provide dining discount"
        else:
            return "Offer general cashback reward"

    top_spenders['REWARD_SUGGESTION'] = top_spenders.apply(suggest_rewards, axis=1)

else:
    # If 'PURCHASES_LOCATION' does not exist, analyze only 'BALANCE' and 'PURCHASES'
    top_spenders = at_risk_customers[['CUST_ID', 'BALANCE', 'PURCHASES']].sort_values(by='PURCHASES', ascending=False)

    # Default reward suggestion if no location data is available
    def suggest_rewards(row):
        if row['PURCHASES'] > 1000:
            return "Offer discount on next purchase"
        elif row['BALANCE'] > 3000:
            return "Provide cashback reward"
        else:
            return "Offer free membership renewal"

    top_spenders['REWARD_SUGGESTION'] = top_spenders.apply(suggest_rewards, axis=1)

      CUST_ID       BALANCE  PURCHASES PURCHASES_LOCATION
6781     6781   6767.138783     999.97        Gas Station
3024     3024   6995.302500     999.77    Online Shopping
2878     2878  12323.845360     989.32    Online Shopping
207       207   7201.736985     987.10               Mall
2349     2349   8356.277203     983.24         Restaurant
...       ...           ...        ...                ...
3496     3496   5998.431562       0.00        Gas Station
3501     3501   9993.352521       0.00        Supermarket
3532     3532   5944.084692       0.00         Restaurant
3556     3556   6930.149580       0.00        Supermarket
24         24   5368.571219       0.00         Restaurant

[426 rows x 4 columns]


In [23]:
# Display reward suggestions
print("\nReward Suggestions for At-Risk Customers:")
print(top_spenders[['CUST_ID', 'PURCHASES', 'REWARD_SUGGESTION']].head(5))



Reward Suggestions for At-Risk Customers:
      CUST_ID  PURCHASES                  REWARD_SUGGESTION
6781     6781     999.97       Provide fuel cashback reward
3024     3024     999.77  Offer free shipping on next order
2878     2878     989.32  Offer free shipping on next order
207       207     987.10      Offer general cashback reward
2349     2349     983.24            Provide dining discount


In [20]:
# Save reward suggestions to a CSV file
top_spenders[['CUST_ID', 'PURCHASES', 'REWARD_SUGGESTION']].to_csv('reward_suggestions.csv', index=False)