<a href="https://colab.research.google.com/github/Radhin-Krishna-R/Customer-complaint-analysis/blob/main/customer_compliant_analysis_implementing_machine_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
import pandas as pd
import numpy as np

In [12]:
# Load the pre-cleaned train and test CSVs from the Colab working directory.
train_c, test_c = (
    pd.read_csv(csv_path)
    for csv_path in ('/content/train_cleaned.csv', '/content/test_cleaned.csv')
)

In [13]:
# Clamp negative "days held" values to zero: this is the time the authority
# took to forward the complaint, so it cannot be negative.
# The duration lives in 'Days_Held'; 'Day' is the day of the month taken from
# 'Date received', which is why the clamp must not target 'Day'.
# NOTE(review): assumes test_c has the same 'Days_Held' column as train_c - confirm.
for frame in (train_c, test_c):
    # clip() leaves missing values untouched and is safe to re-run.
    frame['Days_Held'] = frame['Days_Held'].clip(lower=0)

In [15]:
# Encode the 'Consumer disputed?' target as integers: 'Yes' -> 1, 'No' -> 0.
# Any other value (e.g. missing) is left as it is.
target_col = 'Consumer disputed?'
for answer, code in (('Yes', 1), ('No', 0)):
    train_c.loc[train_c[target_col] == answer, target_col] = code

In [16]:
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

In [17]:
# Preview the first five rows of the training frame.
train_c.head(n=5)

Unnamed: 0,Date received,Product,Issue,Company,State,ZIP code,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID,Day,Month,Year,Days_Held,Week_Received
0,2015-10-14,Credit reporting,Incorrect information on credit report,Equifax,GA,30134,Web,2015-10-14,Closed with explanation,Yes,0,1605653.0,14.0,10.0,2015.0,0.0,2.0
1,2015-04-26,Bank account or service,Deposits and withdrawals,Wells Fargo & Company,GA,319XX,Web,2015-04-26,Closed with explanation,Yes,1,1347613.0,26.0,4.0,2015.0,0.0,4.0
2,2013-12-20,Credit card,Other,Citibank,SC,29203,Phone,2014-01-03,Closed with non-monetary relief,Yes,0,640394.0,20.0,12.0,2013.0,14.0,3.0
3,2016-03-03,Debt collection,Disclosure verification of debt,"FAIR COLLECTIONS & OUTSOURCING, INC.",OH,43082,Referral,2016-03-04,Closed with explanation,Yes,0,1815134.0,3.0,3.0,2016.0,1.0,1.0
4,2015-01-30,Debt collection,Disclosure verification of debt,"HCFS Health Care Financial Services, Inc.",CA,90036,Web,2015-01-30,Closed with explanation,Yes,1,1218613.0,30.0,1.0,2015.0,0.0,5.0


In [18]:
# Build the scaled + PCA-reduced train/test matrices from the cleaned training frame.
categorical_features = ['Product', 'Submitted via', 'Company response to consumer', 'Timely response?']
dependent_variable = 'Consumer disputed?'

# Drop unnecessary columns. errors='ignore' keeps the cell re-runnable: without it
# a second execution fails with KeyError because the column is already gone.
train_c = train_c.drop(columns=['Date sent to company'], errors='ignore')

# Columns kept in train_c but excluded from the feature matrix:
#   - 'Unnamed: 0' / 'Complaint ID': row identifiers, not predictors.
#   - 'Date received' / 'Issue' / 'Company' / 'State': text columns that are not
#     one-hot encoded here; pd.to_numeric(errors='coerce') would only turn them
#     into all-zero columns, so they are removed explicitly instead.
non_feature_columns = ['Unnamed: 0', 'Complaint ID', 'Date received', 'Issue', 'Company', 'State']
train_features = train_c.drop(columns=non_feature_columns, errors='ignore')

# Create dummy variables for categorical features
train_dummies = pd.get_dummies(train_features, columns=categorical_features)

# Separate features and target variable.
X = train_dummies.drop(columns=[dependent_variable])
# Missing labels are treated as "not disputed" (0). The cast to int happens once,
# before the split, so y_train and y_test share the same integer dtype; casting
# only y_train leaves y_test as an object Series that the metrics may reject.
y = train_dummies[dependent_variable].fillna(0).astype(int)

# Coerce any remaining non-numeric values (e.g. masked ZIP codes such as '319XX') to 0.
X = X.apply(pd.to_numeric, errors='coerce').fillna(0)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the data; the scaler is fitted on the training split only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply PCA, keeping enough components to explain 80% of the variance.
pca = PCA(n_components=0.80)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)


In [None]:
# Candidate classifiers. The stochastic estimators are seeded so that a fresh
# "Restart & Run All" reproduces the same scores.
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'AdaBoost': AdaBoostClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'K-Neighbors': KNeighborsClassifier(),
    'XGBoost': XGBClassifier(random_state=42)
}

# Only y_train is guaranteed to be integer-typed upstream; cast the hold-out labels
# too so accuracy_score compares like with like (no-op if they are already ints).
y_test_int = y_test.astype(int)

results = {}
for name, model in models.items():
    # Train on the scaled + PCA-reduced training split.
    model.fit(X_train_pca, y_train)

    # Predict on the hold-out split and score it.
    y_pred = model.predict(X_test_pca)
    test_accuracy = accuracy_score(y_test_int, y_pred)

    # Cross-validate with the SAME preprocessing the test score uses. Wrapping
    # scaler + PCA + estimator in a pipeline re-fits the preprocessing inside each
    # fold (no leakage from the validation fold). cross_val_score clones the
    # pipeline, so the `model` fitted above is left untouched.
    cv_pipeline = make_pipeline(StandardScaler(), PCA(n_components=0.80), model)
    validation_accuracy = cross_val_score(cv_pipeline, X_train, y_train, cv=5, scoring='accuracy').mean()

    # Store the results
    results[name] = {
        'Test Accuracy': test_accuracy,
        'Validation Accuracy': validation_accuracy
    }


In [None]:
# Every model was already fitted and scored in the previous cell, so refitting
# them here would only repeat that work. Show the collected scores instead:
# one row per model, one column per metric.
results_df = pd.DataFrame.from_dict(results, orient='index')
results_df