<a href="https://colab.research.google.com/github/Sanjana-Savadatti/IBM_Project/blob/main/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import drive
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import DBSCAN
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import silhouette_score, precision_score, recall_score, f1_score, mean_squared_error, confusion_matrix

# Mount Google Drive so files under /content/drive are reachable from this runtime
drive.mount('/content/drive')

# Load the dataset from the Excel workbook into the DataFrame `data`,
# which the following steps read and update
file_path = '/content/drive/My Drive/CRM_Data.xlsx'  # Update the path
data = pd.read_excel(file_path)

# Correct misspelled 'gmail' in email addresses
def correct_gmail(email):
    """Return *email* with common misspellings of ``gmail.com`` corrected.

    Missing values (``None``, ``NaN``, ``pd.NA``) are returned unchanged so
    they stay recognisable as missing; stringifying them would produce the
    literal text ``'nan'``/``'None'``. Any other non-string value is
    converted with ``str`` before matching. Matching is case-insensitive and
    the replacement is always lowercase ``gmail.com``.
    """
    if not isinstance(email, str):
        if pd.api.types.is_scalar(email) and pd.isna(email):
            return email
        email = str(email)
    return re.sub(
        r'\b(?:gmial|gmaill|gmai|gmal|gmaul|gmeil)\.com\b',
        'gmail.com',
        email,
        flags=re.IGNORECASE,
    )

if 'email' in data.columns:
    # Normalise case first, then repair misspelled gmail domains.
    lowercase_emails = data['email'].str.lower()
    data['email'] = lowercase_emails.apply(correct_gmail)

# Fill missing names using email prefix
def extract_name_from_email(email):
    """Derive a fallback name from the part of *email* before the ``@``.

    Returns ``'Unknown'`` when *email* is not a string (which covers
    ``None``/``NaN``), contains no ``@``, or has nothing but whitespace
    before the ``@``.
    """
    if not isinstance(email, str):
        return 'Unknown'
    local_part, at_sign, _ = email.partition('@')
    local_part = local_part.strip()
    # An address such as '@example.com' carries no usable name.
    return local_part if at_sign and local_part else 'Unknown'

if {'name', 'email'}.issubset(data.columns):
    # Only rows whose name is missing take the email-derived value.
    names_from_email = data['email'].apply(extract_name_from_email)
    data['name'] = data['name'].fillna(names_from_email)

# Flag likely duplicates using TF-IDF & DBSCAN
# Rows that share a non-negative 'cluster' label are candidate duplicates;
# -1 is DBSCAN's noise label (no close match). No rows are dropped here.
# A missing column contributes an empty string (a plain str has no .fillna).
name_text = data['name'].fillna('').astype(str) if 'name' in data.columns else ''
email_text = data['email'].fillna('').astype(str) if 'email' in data.columns else ''
data['combined'] = name_text + ' ' + email_text
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(data['combined'])

# Pick the eps whose clustering has the best silhouette score.
eps_values = np.linspace(0.3, 1.0, 5)
best_eps = None
best_score = -1
best_clusters = None
fallback_clusters = None
for eps in eps_values:
    dbscan = DBSCAN(eps=eps, min_samples=2, metric='cosine')
    clusters = dbscan.fit_predict(X)
    if fallback_clusters is None:
        # Labels for the smallest (most conservative) eps, used if nothing can be scored
        fallback_clusters = clusters
    # silhouette_score is only defined for 2 <= n_labels <= n_samples - 1
    if not 1 < len(set(clusters)) < X.shape[0]:
        continue
    # Score with the same distance metric DBSCAN clustered with
    score = silhouette_score(X, clusters, metric='cosine')
    if best_clusters is None or score > best_score:
        best_score = score
        best_eps = eps
        best_clusters = clusters

if best_clusters is None:
    # No eps gave a scoreable clustering; fall back instead of calling DBSCAN(eps=None)
    best_eps = eps_values[0]
    best_clusters = fallback_clusters

# Reuse the labels already computed for best_eps rather than refitting
data['cluster'] = best_clusters

# Missing Value Imputation with Linear Regression & KNN
# Step 1: predict missing 'age' values from the other columns with a linear model.
if 'age' in data.columns:
    age_missing = data['age'].isnull()
    train_data = data[~age_missing]
    # Explicit copy: this frame is written to below, and writing to a slice
    # of `data` raises pandas' SettingWithCopyWarning.
    predict_data = data[age_missing].copy()

    imputer = SimpleImputer(strategy='mean')

    # One-hot encode the whole frame once so training and prediction rows share
    # the same columns and the same dropped reference category.
    features = pd.get_dummies(
        data.drop(columns=['age', 'combined', 'cluster'], errors='ignore'),
        drop_first=True,
    )
    # The mean imputer discards columns with no observed values, which would
    # change the column count; drop them up front.
    features = features.loc[:, features[~age_missing].notna().any()]

    # Fitting needs known ages, predicting needs missing ones, and both need features;
    # scikit-learn rejects zero-sample and zero-feature inputs.
    if len(train_data) and len(predict_data) and features.shape[1]:
        X_train = pd.DataFrame(
            imputer.fit_transform(features[~age_missing]),
            columns=features.columns,
            index=train_data.index,
        )
        y_train = train_data['age']

        model = LinearRegression()
        model.fit(X_train, y_train)

        X_predict = pd.DataFrame(
            imputer.transform(features[age_missing]),
            columns=features.columns,
            index=predict_data.index,
        )

        predicted_age = model.predict(X_predict)
        predict_data['age'] = predicted_age
        # Write back in place so the original row order is preserved
        data.loc[age_missing, 'age'] = predicted_age

# Step 2: fill remaining gaps in numeric columns with a 5-nearest-neighbour imputer.
# NOTE(review): this includes the DBSCAN 'cluster' labels and any numeric
# target column, as the original selection did; confirm that is intended.
knn_imputer = KNNImputer(n_neighbors=5)
# Skip columns with no observed values: KNNImputer drops them by default,
# so the result would no longer line up with the selected columns.
numeric_cols = [
    col for col in data.select_dtypes(include=['float64', 'int64']).columns
    if data[col].notna().any()
]
if numeric_cols:
    data[numeric_cols] = knn_imputer.fit_transform(data[numeric_cols])

# Anomaly Detection using Decision Tree & Logistic Regression
# Trains both classifiers to predict the 'valid' column and plots the
# decision tree's confusion matrix on a 20% hold-out split.
if 'valid' in data.columns:
    X = pd.get_dummies(data.drop(columns=['valid', 'combined', 'cluster'], errors='ignore'), drop_first=True)
    y = data['valid']

    # The mean imputer discards columns with no observed values, which would
    # break the DataFrame rebuild below; drop them first.
    X = X.loc[:, X.notna().any()]

    # Handle missing values in features before training.
    # A dedicated imputer is created here so this step does not depend on
    # the age-imputation branch having run.
    imputer = SimpleImputer(strategy='mean')
    X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns, index=X.index)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    dt_clf = DecisionTreeClassifier(max_depth=5, random_state=42)
    dt_clf.fit(X_train, y_train)
    dt_predictions = dt_clf.predict(X_test)

    # The default 100 iterations is often too few for unscaled one-hot features
    log_reg = LogisticRegression(max_iter=1000)
    log_reg.fit(X_train, y_train)
    log_predictions = log_reg.predict(X_test)

    # Confusion matrix of the decision tree on the held-out rows
    cm = confusion_matrix(y_test, dt_predictions)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix')
    plt.show()

# Save the cleaned and enriched data
# Writes `data` to an Excel workbook on the mounted Drive, omitting the row index.
# NOTE(review): the helper columns 'combined' and 'cluster' added during the
# clustering step are not dropped before this point, so they are exported
# too; confirm that is intended.
cleaned_file_path = '/content/drive/My Drive/cleaned_data.xlsx'
data.to_excel(cleaned_file_path, index=False)

Mounted at /content/drive


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  predict_data['age'] = model.predict(X_predict)
