In [3]:
# import pandas
import pandas as pd 
#load data
df = pd.read_csv("data/Syria_Tel_data.csv")
df.head()

Unnamed: 0,state,account length,area code,phone number,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,...,total eve calls,total eve charge,total night minutes,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,customer service calls,churn
0,KS,128,415,382-4657,no,yes,25,265.1,110,45.07,...,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,107,415,371-7191,no,yes,26,161.6,123,27.47,...,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,137,415,358-1921,no,no,0,243.4,114,41.38,...,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,375-9999,yes,no,0,299.4,71,50.9,...,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,330-6626,yes,no,0,166.7,113,28.34,...,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


In [4]:
y = df["churn"]
X = df.drop("churn",axis = 1)

In [5]:
# split train and test dataset
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2,random_state = 42)

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Define which columns are numerical and which are categorical
numerical_columns = ['account length', 'number vmail messages', 'total day minutes', 'total day calls', 'total day charge']
categorical_columns = ['state', 'voice mail plan', 'international plan']

# Create transformers for numerical and categorical columns
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(sparse_output=False, drop='first')  # drop the first category to avoid multicollinearity

# Use the ColumnTransformer to apply the transformers to the correct columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_columns),
        ('cat', categorical_transformer, categorical_columns)
    ])

# Fit and transform on the training data
X_train_scaled = preprocessor.fit_transform(X_train)
# Transform the test data
X_test_scaled = preprocessor.transform(X_test)


In [7]:
# Import the SMOTE library
from imblearn.over_sampling import SMOTE

# Instantiate SMOTE
smote = SMOTE(random_state=42)

# Apply SMOTE to your feature data
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

In [8]:
#instantiate logistic regression model
from sklearn.linear_model import LogisticRegression
logreg_2 = LogisticRegression(C= 1,penalty = "l1",solver = "liblinear")
#fit the model
model_2 = logreg_2.fit(X_train_scaled,y_train)
#predict
y_pred_2 = model_2.predict(X_test_scaled)

In [9]:
model_2.score(X_test_scaled,y_test)

0.8470764617691154

In [10]:
# classification report
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred_2))

              precision    recall  f1-score   support

       False       0.86      0.98      0.92       566
        True       0.48      0.10      0.16       101

    accuracy                           0.85       667
   macro avg       0.67      0.54      0.54       667
weighted avg       0.80      0.85      0.80       667



In [11]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

# Define the hyperparameters to search over
criteria = ["gini", "entropy"]
max_depths = [1, 2, 3, 4, 5]
random_states = [42]

# Variables to store the best hyperparameters and accuracy
best_params = {}
best_accuracy = 0.0

# Iterate through hyperparameters
for criterion in criteria:
    for max_depth in max_depths:
        for random_state in random_states:
            # Instantiate DecisionTreeClassifier with specified hyperparameters
            tree = DecisionTreeClassifier(criterion=criterion, max_depth=max_depth, random_state=random_state)
            
            # Fit the model (you need to have training data X_train and y_train)
            tree.fit(X_train_scaled, y_train)
            
            # Perform cross-validation and evaluate the model
            scores = cross_val_score(tree, X_train_scaled, y_train, cv=5, scoring='accuracy')
            mean_accuracy = scores.mean()
            
            if mean_accuracy > best_accuracy:
                best_accuracy = mean_accuracy
                best_params = {'Criterion': criterion, 'Max Depth': max_depth, 'Random State': random_state}

# Print the best hyperparameters and accuracy
print("Best Hyperparameters:", best_params)

Best Hyperparameters: {'Criterion': 'entropy', 'Max Depth': 3, 'Random State': 42}


In [14]:
import xgboost as xgb

# Define the best hyperparameters
best_hyperparameters = {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200}

# Create an XGBoost classifier with the best hyperparameters
xgb_classifier = xgb.XGBClassifier(
    max_depth=best_hyperparameters['max_depth'],
    learning_rate=best_hyperparameters['learning_rate'],
    n_estimators=best_hyperparameters['n_estimators'],
    random_state=42
)

# Fit the model using X_train_scaled
xgb_classifier.fit(X_train_resampled, y_train_resampled)

# Make predictions on the test set (you should have X_test_scaled)
y_pred_xgb = xgb_classifier.predict(X_test_scaled)

# Evaluate the model
scores = cross_val_score(xgb_classifier, X_train_resampled, y_train_resampled, cv=5, scoring='accuracy')
mean_accuracy = scores.mean()
report = classification_report(y_test, y_pred_xgb)

print(f"Accuracy: {mean_accuracy:.2f}")
print("Classification Report:")
print(report)


Accuracy: 0.90
Classification Report:
              precision    recall  f1-score   support

       False       0.90      0.94      0.92       566
        True       0.53      0.41      0.46       101

    accuracy                           0.86       667
   macro avg       0.72      0.67      0.69       667
weighted avg       0.84      0.86      0.85       667



In [16]:
import numpy as np

# Calculate mean values for relevant columns from the original dataset
total_day_calls_1 = X_test['total day calls'].mean()
total_eve_calls_1 = X_test['total eve calls'].mean()
total_night_calls_1 = X_test['total night calls'].mean()
total_intl_calls_1 = X_test['total intl calls'].mean()

# Create a copy of your dataset
data_copy = X_test_scaled.copy()

# Define a dictionary mapping column names to their indices
feature_indices = {
    'total day charge': X_test.columns.get_loc('total day charge'),
    'total eve charge': X_test.columns.get_loc('total eve charge'),
    'total night charge': X_test.columns.get_loc('total night charge'),
    'total intl charge': X_test.columns.get_loc('total intl charge')
    # Add more columns if needed
}

# Define charge scenarios (e.g., vary day, evening, night, and international charges)
charge_scenarios = np.arange(0, 50, 5)  # Adjust the range and step as needed

# Initialize variables to track optimal charges and minimum churn probabilities
optimal_charges = {}
min_churn_prob = 1.0  # Initialize with a high value

# Define profit function
def calculate_profit(data, charges):
    total_revenue = (
        data[:, feature_indices['total day charge']] * total_day_calls_1 +
        data[:, feature_indices['total eve charge']] * total_eve_calls_1 +
        data[:, feature_indices['total night charge']] * total_night_calls_1 +
        data[:, feature_indices['total intl charge']] * total_intl_calls_1
    )
    return total_revenue - charges

# Loop through charge scenarios
for day_charge in charge_scenarios:
    for eve_charge in charge_scenarios:
        for night_charge in charge_scenarios:
            for intl_charge in charge_scenarios:
                # Create a copy of the original dataset to reset for each scenario
                scenario_data = data_copy.copy()

                # Set hypothetical charges
                scenario_data[:, feature_indices['total day charge']] = day_charge
                scenario_data[:, feature_indices['total eve charge']] = eve_charge
                scenario_data[:, feature_indices['total night charge']] = night_charge
                scenario_data[:, feature_indices['total intl charge']] = intl_charge

                # Predict churn probabilities using the XGBoost model
                churn_probs = xgb_classifier.predict_proba(scenario_data)[:, 1]

                # Calculate the mean churn probability
                mean_churn_prob = np.mean(churn_probs)

                # Calculate profit
                profit = calculate_profit(scenario_data, day_charge + eve_charge + night_charge + intl_charge)

                # Check if it's a new minimum churn probability and a profit > 0
                if mean_churn_prob < min_churn_prob and profit > 0:
                    min_churn_prob = mean_churn_prob
                    optimal_charges = {
                        'Day Charge': day_charge,
                        'Evening Charge': eve_charge,
                        'Night Charge': night_charge,
                        'Intl Charge': intl_charge,
                        'Min Churn Probability': min_churn_prob
                    }

# Print the optimal charges and minimum churn probability
print("Optimal Charges and Minimum Churn Probability:")
print(optimal_charges)

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()