# Library

In [None]:
import pandas as pd
import re
import string
from googletrans import Translator, LANGUAGES
import torch
from transformers import BertTokenizer, BertModel
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, make_scorer
from torch import nn, optim
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from deep_translator import GoogleTranslator
from sklearn.model_selection import GridSearchCV

# Preprocessing

In [None]:
# Compiled once at import time: remove_emojis is applied row-by-row to a whole
# DataFrame column, so the pattern should not be rebuilt on every call.
#
# Each entry is a discrete Unicode block. A single span from U+24C2 to U+1F251
# must NOT be used here: it would also swallow CJK, Hangul, box-drawing and
# many other non-emoji characters that lie between those two code points.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    "\U0001F000-\U0001F2FF"  # mahjong/cards, enclosed characters, flags
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002702-\U000027B0"  # dingbats
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows (e.g. star)
    "\U0000FE00-\U0000FE0F"  # variation selectors left behind by emoji
    "\U000024C2"             # circled latin capital letter M
    "]+",
    flags=re.UNICODE,
)


def remove_emojis(text):
    """Return ``text`` with emoji and pictographic symbols stripped out.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` in which every run of characters matched by
        ``_EMOJI_PATTERN`` has been deleted. Ordinary letters (including
        accented Latin and CJK text), digits, punctuation and whitespace
        are left untouched.
    """
    return _EMOJI_PATTERN.sub('', text)

def preprocess_text(text):
    """Clean a comment and return it as a space-joined string of lemmas.

    Steps, in order: strip emojis with ``remove_emojis``, delete every
    character that is neither a word character nor whitespace, split on
    whitespace, drop Portuguese and English stop words (compared in lower
    case), and lemmatize each remaining token with ``WordNetLemmatizer``.

    Args:
        text: The raw comment string.

    Returns:
        The surviving lemmatized tokens joined by single spaces.
    """
    # NOTE(review): `stopwords` and `WordNetLemmatizer` are not imported in the
    # visible import cell -- presumably they come from nltk; confirm they are
    # in scope before calling this function.
    without_symbols = re.sub(r'[^\w\s]', '', remove_emojis(text))

    # Union of both languages' stop-word lists, matched case-insensitively below.
    ignored_words = set(stopwords.words('portuguese')).union(set(stopwords.words('english')))

    word_lemmatizer = WordNetLemmatizer()
    return " ".join(
        word_lemmatizer.lemmatize(word)
        for word in without_symbols.split()
        if word.lower() not in ignored_words
    )

# Load the dataset (a single read is enough; the CSV does not change in between)
df = pd.read_csv("Dataset.csv")

# Cast the 'comment' column to string so the regex-based cleaner accepts every row.
# NOTE(review): depending on the pandas version, missing comments may become the
# literal text "nan" here -- confirm whether such rows should be dropped instead.
df['comment'] = df['comment'].astype(str)

# Strip emojis from every comment
df['comment'] = df['comment'].apply(remove_emojis)

# Keep only comments of at most 256 characters.
# NOTE(review): no translation step runs in this cell, so the length is measured
# on the emoji-stripped original text.
df = df[df['comment'].str.len() <= 256]

# Keep only the columns used downstream: identifier, cleaned comment text and label
df = df[["id", "comment", "churn"]]


In [None]:
# Split data into training and test sets (80% / 20%).
# A fixed random_state makes the split reproducible, so the scores printed by
# the later modeling cells are comparable from one run to the next.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Load Tokenizer and BERT Model.
# Both come from the same 'bert-base-uncased' checkpoint so that the token ids
# produced by the tokenizer match the encoder's vocabulary.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model_bert = BertModel.from_pretrained('bert-base-uncased')

# Dataset Class
class ChurnDataset(Dataset):
    """Map-style dataset that tokenizes one comment per item.

    Args:
        comments: Indexable sequence of comment strings.
        labels: Indexable sequence of labels, aligned with ``comments``.
        tokenizer: Optional object exposing ``encode_plus``. When omitted,
            the module-level ``tokenizer`` is looked up at item-access time.
        max_length: Fixed number of token ids produced for every item.
    """

    def __init__(self, comments, labels, tokenizer=None, max_length=256):
        self.comments = comments
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of comments in the dataset."""
        return len(self.comments)

    def __getitem__(self, idx):
        """Tokenize the comment at ``idx`` and return it with its label.

        Returns:
            A dict with the raw ``comment_text``, the flattened
            ``input_ids`` and ``attention_mask`` tensors, and ``labels``
            as a ``torch.long`` tensor.
        """
        comment = self.comments[idx]
        label = self.labels[idx]

        # Resolved lazily so the dataset can be built before the global
        # tokenizer exists, and so an explicit tokenizer takes precedence.
        active_tokenizer = self.tokenizer if self.tokenizer is not None else tokenizer

        encoding = active_tokenizer.encode_plus(
            comment,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
            padding='max_length',
            # Without truncation a comment whose tokenization exceeds
            # `max_length` would yield a longer tensor than its neighbours
            # and the default batch collation would fail.
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'comment_text': comment,
            # return_tensors='pt' adds a leading batch axis; flatten drops it.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Datasets: hand each split's comment and label columns to ChurnDataset as
# NumPy arrays, so items are looked up by position rather than by the
# DataFrame's index labels.
train_dataset, test_dataset = [
    ChurnDataset(split['comment'].to_numpy(), split['churn'].to_numpy())
    for split in (train_df, test_df)
]

# DataLoaders: both use batches of 16; only the training loader shuffles.
train_loader = DataLoader(
    train_dataset,
    batch_size=16,
    shuffle=True,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=16,
)

# ChurnClassifier Class
class ChurnClassifier(nn.Module):
    """Single-output classification head stacked on an encoder.

    The encoder is called with ``return_dict=False`` and must return a
    two-element tuple whose second element is the pooled representation
    with 768 features per example.

    Args:
        bert_model: The encoder to wrap; stored as ``self.bert``.
    """

    def __init__(self, bert_model):
        super().__init__()
        self.bert = bert_model
        self.dropout = nn.Dropout(0.1)
        self.linear = nn.Linear(768, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        """Return one sigmoid-squashed score per example.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Attention mask forwarded to the encoder.

        Returns:
            The output of dropout -> linear -> sigmoid applied to the
            encoder's pooled output.
        """
        # Only the pooled output is used; the first element is discarded.
        _first_output, pooled = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False,
        )
        return self.sigmoid(self.linear(self.dropout(pooled)))

# Initialize the Model.
# ChurnClassifier keeps a reference to model_bert, so model.parameters() below
# includes the encoder weights and training updates model_bert itself.
model = ChurnClassifier(model_bert)

# Training setup: Adam over every parameter, binary cross-entropy on the
# sigmoid outputs produced by the classifier.
optimizer = optim.Adam(model.parameters(), lr=1e-5)
criterion = nn.BCELoss()

for epoch in range(3):
    # Put the model (and its dropout layers) in training mode for this epoch.
    model.train()
    for batch in train_loader:
        input_ids, attention_mask, labels = (
            batch[key] for key in ('input_ids', 'attention_mask', 'labels')
        )
        # BCELoss needs float targets shaped like the model output, which has
        # a trailing axis of size 1.
        targets = labels.unsqueeze(1).float()

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

# Function to extract embeddings from BERT
def extract_embeddings(dataloader, model):
    """Collect the model's pooled outputs and the labels for every batch.

    The model is switched to eval mode and run without gradient tracking.
    It is called with ``return_dict=False`` and must return a two-element
    tuple; the second element is kept as the embedding.

    Args:
        dataloader: Iterable of batches, each a mapping with the keys
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'``.
        model: Callable module accepting ``input_ids``, ``attention_mask``
            and ``return_dict`` keyword arguments.

    Returns:
        A ``(embeddings, labels)`` pair of NumPy arrays, with rows in the
        order the dataloader yielded them.
    """
    model.eval()
    pooled_rows, label_rows = [], []

    with torch.no_grad():
        for batch in dataloader:
            ids = batch['input_ids']
            mask = batch['attention_mask']
            batch_labels = batch['labels']

            _, pooled = model(input_ids=ids, attention_mask=mask, return_dict=False)

            # Store one entry per example so the final arrays are row-aligned.
            pooled_rows += list(pooled.detach().cpu().numpy())
            label_rows += list(batch_labels.detach().cpu().numpy())

    return np.array(pooled_rows), np.array(label_rows)

# Extract embeddings for the training set first, then the test set, using the
# bare BERT encoder (model_bert) rather than the classifier wrapper.
# train_loader shuffles, so the training rows follow the loader's order, not
# train_df's; embeddings and labels stay aligned because both are read from
# the same batches.
(train_embeddings, train_labels), (test_embeddings, test_labels) = [
    extract_embeddings(loader, model_bert)
    for loader in (train_loader, test_loader)
]

# Modeling

### Random Forest

In [None]:

rf = RandomForestClassifier()

# Hyperparameter space for the search: every combination of the values below
# is evaluated.
param_grid = dict(
    n_estimators=[10, 50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
)

# Define the scorer: binary F1
# (use make_scorer(accuracy_score) instead to rank candidates by accuracy)
scorer = make_scorer(f1_score, average='binary')

# Instantiate the GridSearchCV: 5-fold cross-validation, all available cores
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring=scorer,
    cv=5,
    verbose=2,
    n_jobs=-1,
)

# Fit the search on the BERT embeddings of the training split
grid_search.fit(train_embeddings, train_labels)

# Report the winning configuration and its mean cross-validated score
print("Best hyperparameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

### SVM

In [None]:
svm = SVC()

# Hyperparameter space for SVM search.
# The grid is split per kernel so that each kernel is only crossed with the
# parameters it actually uses: `degree` affects the 'poly' kernel only, and
# `gamma` is ignored by the 'linear' kernel. A single flat grid would refit
# many candidates that differ only in ignored parameters (54 instead of 27).
svm_c_values = [0.1, 1, 10]  # Regularization
param_grid_svm = [
    {'kernel': ['linear'], 'C': svm_c_values},
    {'kernel': ['rbf'], 'C': svm_c_values, 'gamma': ['scale', 'auto']},
    {
        'kernel': ['poly'],
        'C': svm_c_values,
        'gamma': ['scale', 'auto'],
        'degree': [2, 3, 4],
    },
]

# GridSearchCV for SVM: 5-fold cross-validation ranked by `scorer`, which must
# already be defined (it is created in the Random Forest cell).
grid_search_svm = GridSearchCV(estimator=svm, param_grid=param_grid_svm, scoring=scorer, cv=5, verbose=2, n_jobs=-1)

# Fit the SVM search on the BERT embeddings of the training split
grid_search_svm.fit(train_embeddings, train_labels)

# Best parameters for SVM (only the keys relevant to the winning kernel appear)
print("Best hyperparameters for SVM:", grid_search_svm.best_params_)

# Best score for SVM
print("Best score for SVM:", grid_search_svm.best_score_)


### MLP

In [None]:
mlp = MLPClassifier()

# Hyperparameter space for MLP search.
# Values shared by every solver:
mlp_common_grid = {
    'hidden_layer_sizes': [(50, 50), (100, 100), (50, 100, 50)],  # Varied sizes for hidden layers
    'activation': ['tanh', 'relu', 'logistic'],  # Different activation functions
    'alpha': [0.0001, 0.001, 0.05],  # Varied values for L2 regularization term
}

# The learning-rate schedule is only used by the 'sgd' solver, so it is crossed
# with 'sgd' alone. Crossing it with 'adam' and 'lbfgs' as well would refit
# candidates that differ only in an ignored parameter (243 instead of 135).
param_grid_mlp = [
    {
        **mlp_common_grid,
        'solver': ['sgd'],
        'learning_rate': ['constant', 'adaptive', 'invscaling'],
    },
    {
        **mlp_common_grid,
        'solver': ['adam', 'lbfgs'],
    },
]

# Define the scorers: macro-averaged F1 and plain accuracy are both recorded
scoring = {
    'f1_macro': make_scorer(f1_score, average='macro'),
    'accuracy': 'accuracy',
}

# Instantiate the GridSearchCV: 5-fold cross-validation; with several scorers,
# `refit` names the one used to pick and refit the best candidate.
grid_search_mlp = GridSearchCV(estimator=mlp, param_grid=param_grid_mlp, scoring=scoring, refit='f1_macro', cv=5, verbose=2, n_jobs=-1)

# Fit the MLP search on the BERT embeddings of the training split
grid_search_mlp.fit(train_embeddings, train_labels)

# Show the results
print("Best hyperparameters for MLP:", grid_search_mlp.best_params_)
print("Best macro f1-score for MLP:", grid_search_mlp.best_score_)

# Accuracy of that same best-by-F1 candidate, read from the per-candidate results
print("Best accuracy for MLP:", grid_search_mlp.cv_results_['mean_test_accuracy'][grid_search_mlp.best_index_])
