In [8]:
import pandas as pd
import numpy as np
import os
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

# Ensure the output directory for serialized models exists before any
# training runs; exist_ok avoids an error when the folder is already there.
os.makedirs("models", exist_ok=True)

# ==========================================================
# Part 1: sentiment analysis on Twitter-style data
# ==========================================================

print("\n========== Sentiment Analysis on Twitter Data ==========\n")

# Seed NumPy's global RNG so the synthetic dataset below is reproducible.
# (The synthetic data can be swapped for real tweets later.)
np.random.seed(42)

# Sentence templates for each sentiment class; "{}" is the slot that gets
# filled with one of the entities listed further down.
positive_templates = [
    "I love {}",
    "So happy with {}",
    "{} made my day.",
    "Fantastic experience with {}",
    "Absolutely recommend {}",
    "Feeling great about {}",
]
negative_templates = [
    "I hate {}",
    "Terrible experience with {}",
    "{} ruined my day.",
    "Very disappointed with {}",
    "Never using {} again.",
    "So upset about {}",
]
neutral_templates = [
    "{} is okay.",
    "It's about {}",
    "I have no opinion about {}",
    "{} was average.",
    "Neither good nor bad about {}",
]

# Subjects that get substituted into the templates.
entities = [
    "the new update",
    "this app",
    "customer service",
    "the product",
    "their support",
    "the event",
]

def make_tweet(label):
    """Build one synthetic tweet for the given sentiment label.

    A template is drawn at random from the pool matching ``label`` and
    then filled with a randomly drawn entity. Both draws use NumPy's
    global RNG, template first and entity second.

    Args:
        label: ``"positive"`` or ``"negative"`` selects the matching
            template pool; any other value uses the neutral templates.

    Returns:
        The formatted tweet text.
    """
    if label == "positive":
        pool = positive_templates
    elif label == "negative":
        pool = negative_templates
    else:
        pool = neutral_templates

    # Keep the draw order (template, then entity) so seeded runs reproduce.
    template = np.random.choice(pool)
    subject = np.random.choice(entities)
    return template.format(subject)

# Draw 600 sentiment labels with a 40/35/25 class mix, then synthesize one
# tweet per label. Labels are sampled before any tweet is generated, so the
# sequence of RNG draws stays fixed for a given seed.
n = 600
labels = np.random.choice(
    ["positive", "negative", "neutral"],
    size=n,
    p=[0.4, 0.35, 0.25],
)
tweets = list(map(make_tweet, labels))

sent_df = pd.DataFrame({"text": tweets, "label": labels})
print("Sample Sentiment Data:\n", sent_df.head(), "\n")

# NOTE: the templates can produce at most 102 distinct tweets
# (17 templates x 6 entities), so the 600 rows contain many duplicates and
# the held-out split shares texts with the training split. Scores on this
# synthetic data are therefore not indicative of real-world performance.

# Hold out 20% of the rows, stratified so each split keeps the class mix.
X_train, X_test, y_train, y_test = train_test_split(
    sent_df["text"],
    sent_df["label"],
    test_size=0.2,
    random_state=42,
    stratify=sent_df["label"],
)

# Text classifier: unigram + bigram TF-IDF features feeding a logistic
# regression.
sent_pipeline = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer(ngram_range=(1, 2), max_features=5000)),
        ("clf", LogisticRegression(max_iter=1000, solver='lbfgs')),
    ]
)

# Fit on the training split.
sent_pipeline.fit(X_train, y_train)

# Score the held-out split and report per-class metrics.
y_pred = sent_pipeline.predict(X_test)
print("=== Classification Report (Sentiment Analysis) ===")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Persist the fitted pipeline (vectorizer + classifier) to the models folder.
joblib.dump(sent_pipeline, "models/sentiment_pipeline.joblib")
print("\n✅ Sentiment model saved as: models/sentiment_pipeline.joblib\n")

# ==========================================================
# Part 2: customer churn prediction
# ==========================================================

print("\n========== Customer Churn Prediction ==========\n")

# Synthetic binary-classification data: 2000 rows, 10 numeric features,
# roughly 70/30 class balance, with 3% of labels randomly flipped.
X_num, y = make_classification(
    n_samples=2000,
    n_features=10,
    n_informative=6,
    n_redundant=2,
    n_clusters_per_class=2,
    weights=[0.7, 0.3],
    flip_y=0.03,
    random_state=42,
)

num_cols = [f"num_{i}" for i in range(X_num.shape[1])]
churn_df = pd.DataFrame(X_num, columns=num_cols)

# Derive telecom-style columns from the numeric features so the frame has a
# mix of categorical and numeric inputs.
# Contract: num_0 split into three equal-width bins.
churn_df["Contract"] = pd.cut(
    churn_df["num_0"],
    bins=3,
    labels=["Month-to-month", "One year", "Two year"],
)
# PaymentMethod: two-way split on the sign of num_1.
churn_df["PaymentMethod"] = np.where(
    churn_df["num_1"] > 0, "Electronic check", "Mailed check"
)
# SeniorCitizen: 0/1 flag, set where num_2 exceeds 0.5.
churn_df["SeniorCitizen"] = churn_df["num_2"].gt(0.5).astype(int)
# tenure_months: |num_3| scaled by 12, truncated to int, limited to 0..72.
churn_df["tenure_months"] = (
    (churn_df["num_3"].abs() * 12).astype(int).clip(lower=0, upper=72)
)
# Churn: the generated 0/1 target rendered as "No"/"Yes".
churn_df["Churn"] = np.where(y == 1, "Yes", "No")

print("Sample Churn Data:\n", churn_df.head(), "\n")

# Target column and the feature groups used for modelling.
target = "Churn"
categorical_features = ["Contract", "PaymentMethod", "SeniorCitizen"]
# The only columns named "num_*" are the generated ones, in num_cols order.
numerical_features = [*num_cols, "tenure_months"]

X = churn_df[categorical_features + numerical_features]



Sample Sentiment Data:
                             text     label
0  Absolutely recommend this app  positive
1          this app was average.   neutral
2   Never using the event again.  negative
3   Never using the event again.  negative
4           I love their support  positive 

=== Classification Report (Sentiment Analysis) ===
              precision    recall  f1-score   support

    negative       1.00      1.00      1.00        39
     neutral       1.00      1.00      1.00        32
    positive       1.00      1.00      1.00        49

    accuracy                           1.00       120
   macro avg       1.00      1.00      1.00       120
weighted avg       1.00      1.00      1.00       120

Confusion Matrix:
 [[39  0  0]
 [ 0 32  0]
 [ 0  0 49]]

✅ Sentiment model saved as: models/sentiment_pipeline.joblib



Sample Churn Data:
       num_0     num_1     num_2     num_3     num_4     num_5     num_6  \
0 -0.913297 -0.048776 -1.263867 -1.919355 -1.208197  2.516587  1.5