<a href="https://colab.research.google.com/github/Saakshi05/ML-projects/blob/main/growthlinkml.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Task 3: Customer Churn Prediction

In [None]:
import io

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Read the churn CSV (path is resolved relative to the working directory)
file_path = "Churn_Modelling.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Display initial data info
# DataFrame.info() writes its report to a stream and returns None, so assigning
# its return value would leave df_info as None. Capture the report through a
# text buffer instead so df_info holds the actual summary for later printing.
_info_buffer = io.StringIO()
df.info(buf=_info_buffer)
df_info = _info_buffer.getvalue()

# Snapshot of the first rows, taken before any later preprocessing cell runs
df_head = df.head()

# Still show the summary as this cell's output
print(df_info)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CreditScore      10000 non-null  int64  
 1   Geography        10000 non-null  int64  
 2   Gender           10000 non-null  int64  
 3   Age              10000 non-null  int64  
 4   Tenure           10000 non-null  int64  
 5   Balance          10000 non-null  float64
 6   NumOfProducts    10000 non-null  int64  
 7   HasCrCard        10000 non-null  int64  
 8   IsActiveMember   10000 non-null  int64  
 9   EstimatedSalary  10000 non-null  float64
 10  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9)
memory usage: 859.5 KB


In [None]:
# Count the null entries in every column
missing_values = df.isnull().sum(axis=0)

In [None]:
# Drop irrelevant columns.
# Reassigning (rather than inplace=True) together with errors="ignore" keeps the
# cell idempotent: re-running it once the columns are already gone is a no-op
# instead of a KeyError.
df = df.drop(columns=["RowNumber", "CustomerId", "Surname"], errors="ignore")

In [None]:
# Replace each categorical column with integer codes, using a fresh encoder per column
for column in ("Geography", "Gender"):
    df[column] = LabelEncoder().fit_transform(df[column])

In [None]:
# Target is the "Exited" column; every remaining column is a feature
y = df["Exited"]
X = df.drop(columns=["Exited"])

In [None]:
# Hold out 20% of the rows for testing, stratified on the target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Standardize the features: the scaler is fitted on the training split only,
# then the same transformation is applied to both splits
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Candidate classifiers, keyed by the display name used when reporting results
models = {}
models["Random Forest"] = RandomForestClassifier(n_estimators=100, random_state=42)
models["Logistic Regression"] = LogisticRegression()
models["XGBoost"] = XGBClassifier(eval_metric='logloss', random_state=42)

In [None]:
# Empty containers for per-model metrics and per-model feature importances
results, feature_importances = {}, {}

In [None]:
# Fit every candidate on the training split and score it on the held-out split
models_with_importances = ("Random Forest", "XGBoost")

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Metrics computed on the test predictions
    class_report = classification_report(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)

    results[name] = {
        "Accuracy": accuracy,
        "Confusion Matrix": conf_matrix,
        "Report": class_report,
    }

    # Only these two models have their feature_importances_ recorded
    if name in models_with_importances:
        feature_importances[name] = model.feature_importances_

In [None]:
# Put both models' importances side by side, one row per feature column of X
importance_columns = {
    "Feature": X.columns,
    "Random Forest Importance": feature_importances["Random Forest"],
    "XGBoost Importance": feature_importances["XGBoost"],
}
feature_importance_df = pd.DataFrame(importance_columns)

In [None]:
# Dataset overview: info summary, first rows, and missing-value counts
for heading, payload in (
    ("\n===== Dataset Information =====", df_info),
    ("\n===== First 5 Rows of the Dataset =====", df_head),
    ("\n===== Missing Values in Dataset =====", missing_values),
):
    print(heading)
    print(payload)

# Per-model metrics gathered by the training loop
print("\n===== Model Performance =====")
for model_name, metrics in results.items():
    print(
        f"\n--- {model_name} ---",
        f"Accuracy: {metrics['Accuracy']:.4f}",
        "Confusion Matrix:",
        metrics["Confusion Matrix"],
        "Classification Report:",
        metrics["Report"],
        sep="\n",
    )

# Feature importance table, printed without the index column
print("\n===== Feature Importance (Top Factors Influencing Churn) =====")
importance_table = feature_importance_df.to_string(index=False)
print(importance_table)



===== Dataset Information =====
None

===== First 5 Rows of the Dataset =====
   CreditScore  Geography  Gender  Age  Tenure    Balance  NumOfProducts  \
0          619          0       0   42       2       0.00              1   
1          608          2       0   41       1   83807.86              1   
2          502          0       0   42       8  159660.80              3   
3          699          0       0   39       1       0.00              2   
4          850          2       0   43       2  125510.82              1   

   HasCrCard  IsActiveMember  EstimatedSalary  Exited  
0          1               1        101348.88       1  
1          0               1        112542.58       0  
2          1               0        113931.57       1  
3          0               0         93826.63       0  
4          1               1         79084.10       0  

===== Missing Values in Dataset =====
CreditScore        0
Geography          0
Gender             0
Age                0
Tenur

Task 4: Spam SMS Detection

In [None]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Read the SMS file (decoded as latin-1) and keep only its first two columns
file_path = 'spam.csv'
df = pd.read_csv(file_path, encoding='latin-1').iloc[:, :2]
df.columns = ['label', 'message']

# Binary target: ham -> 0, spam -> 1; any other label value maps to NaN
label_codes = {'ham': 0, 'spam': 1}
df['label'] = df['label'].map(label_codes)

# Discard rows with a missing value in either column
df = df.dropna()

# Function to clean text
def clean_text(text):
    """Normalize a single message before vectorization.

    Lower-cases the text, removes every character in ``string.punctuation``,
    removes digit runs, and collapses whitespace runs into single spaces
    (leading/trailing whitespace is stripped).

    Args:
        text: The raw message.

    Returns:
        The cleaned string. Input that is not a ``str`` is returned unchanged.
    """
    if not isinstance(text, str):
        return text

    text = text.lower()
    # re.escape is required here: unescaped, the "\]" inside string.punctuation
    # is parsed as an escaped "]", so a literal backslash would never be removed.
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)  # Remove punctuation
    text = re.sub(r"\d+", "", text)  # Remove numbers
    # Raw string avoids the invalid "\s" escape warning of a plain literal
    text = re.sub(r"\s+", " ", text).strip()  # Remove extra spaces
    return text

# Run clean_text over every message and store the result in a new column
df['cleaned_message'] = df['message'].map(clean_text)

# Train-test split (80/20), stratified on the label
X_train, X_test, y_train, y_test = train_test_split(
    df['cleaned_message'],
    df['label'],
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

# Labels must already be complete at this point. Dropping NaNs from y alone
# would leave X with extra rows and misalign messages with their labels, so
# verify the invariant instead of silently filtering the targets.
assert not y_train.isna().any() and not y_test.isna().any(), "NaN labels found after the split"

# TF-IDF features: vocabulary and weights are learned from the training
# messages, then the fitted vectorizer is reused on the test messages
vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf, X_test_tfidf = (
    vectorizer.fit_transform(X_train),
    vectorizer.transform(X_test),
)

# Classifiers to compare, keyed by the name shown in the printed report
models = {}
models["Naive Bayes"] = MultinomialNB()
models["Logistic Regression"] = LogisticRegression(max_iter=500)
models["Random Forest"] = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit each classifier on the TF-IDF training matrix and report test metrics
print("\nA robust model that accurately distinguishes between spam and legitimate messages, with well-documented preprocessing and classification approaches.\n")
for name, model in models.items():
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)

    # Compute both metrics first, then print them in the original order
    test_accuracy = accuracy_score(y_test, y_pred)
    test_report = classification_report(y_test, y_pred)

    print(f"\n{name} Performance:")
    print("Accuracy:", test_accuracy)
    print("Classification Report:\n", test_report)


A robust model that accurately distinguishes between spam and legitimate messages, with well-documented preprocessing and classification approaches.


Naive Bayes Performance:
Accuracy: 0.9623318385650225
Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       966
           1       0.99      0.72      0.84       149

    accuracy                           0.96      1115
   macro avg       0.98      0.86      0.91      1115
weighted avg       0.96      0.96      0.96      1115


Logistic Regression Performance:
Accuracy: 0.9605381165919282
Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       966
           1       0.99      0.71      0.83       149

    accuracy                           0.96      1115
   macro avg       0.97      0.86      0.90      1115
weighted avg       0.96      0.96      0.96      1115


Random Forest Performance:
A