In [None]:
# Import necessary libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import mode

# Read the train and test data from the provided CSV files.
# The training frame must come from the training file; loading the test file
# into both frames would make the two datasets identical.
# NOTE(review): the train file name is assumed to mirror the test file name
# ("..._train.csv" next to "..._test.csv") — confirm against the actual upload.
train_data = pd.read_csv('/content/Consumer_Complaints_train.csv')
test_data = pd.read_csv('/content/Consumer_Complaints_test.csv')

# Show the column dtypes of the train frame followed by those of the test frame.
print(train_data.dtypes, test_data.dtypes, sep="\n")

# Missing-value analysis: a column is kept only when at least 75% of its rows
# are non-null, i.e. columns with more than 25% missing data are removed.
train_data = train_data.dropna(axis="columns", thresh=0.75 * len(train_data))
test_data = test_data.dropna(axis="columns", thresh=0.75 * len(test_data))

# Parse 'Date received' into datetimes and split it into the new
# 'Day', 'Month' and 'Year' fields, first for the train frame, then for test.
for frame in (train_data, test_data):
    received = pd.to_datetime(frame['Date received'])
    frame['Date received'] = received
    frame['Day'] = received.dt.day
    frame['Month'] = received.dt.month
    frame['Year'] = received.dt.year

# 'Days held': whole days between the (already parsed) 'Date received' and
# the 'Date sent to company' of each complaint.
for frame in (train_data, test_data):
    sent_on = pd.to_datetime(frame['Date sent to company'])
    elapsed = sent_on - frame['Date received']
    frame['Days held'] = elapsed.dt.days

# The raw date columns, the ZIP code and the complaint identifier are no
# longer needed once the derived fields exist, so remove them in place.
obsolete_fields = ['Date received', 'Date sent to company', 'ZIP code', 'Complaint ID']
for frame in (train_data, test_data):
    frame.drop(columns=obsolete_fields, inplace=True)

# Impute null values in "State" with the most frequent state of each frame.
# pandas' own Series.mode() is used instead of scipy.stats.mode: it ignores
# nulls, works on string columns, and always returns a Series, so taking the
# first entry is safe across library versions. The result of fillna() is
# assigned back to the column rather than using inplace=True on a column
# selection, which does not reliably update the parent frame.
train_state_modes = train_data['State'].mode(dropna=True)
if not train_state_modes.empty:  # an all-null column has no mode to impute with
    train_state_mode = train_state_modes.iloc[0]
    train_data['State'] = train_data['State'].fillna(train_state_mode)

test_state_modes = test_data['State'].mode(dropna=True)
if not test_state_modes.empty:
    test_state_mode = test_state_modes.iloc[0]
    test_data['State'] = test_data['State'].fillna(test_state_mode)

# Create 'Week_Received': the week of the month in which the complaint was
# received, derived from the 'Day' field computed above.
# Days 1-7 -> week 1, 8-14 -> week 2, 15-21 -> week 3, 22-28 -> week 4,
# 29-31 -> week 5. Subtracting 1 before the integer division keeps every
# bucket seven days wide; a plain `Day // 7` would put only days 1-6 in the
# first bucket and number it 0.
train_data['Week_Received'] = (train_data['Day'] - 1) // 7 + 1
test_data['Week_Received'] = (test_data['Day'] - 1) // 7 + 1

# Convert all negative "Days held" to zero (it is the time taken by the
# authority, which can't be negative). This is done BEFORE the disputed
# subsets are taken so that those subsets do not keep the invalid negatives.
# Null values are left untouched by clip().
train_data["Days held"] = train_data["Days held"].clip(lower=0)
test_data["Days held"] = test_data["Days held"].clip(lower=0)

# Store the complaints whose consumer disputed the outcome for future tasks.
# .copy() makes the subsets independent frames, so later edits to them do not
# trigger chained-assignment warnings or affect train_data / test_data.
# NOTE(review): assumes both files contain the "Consumer disputed?" column — confirm for the test file.
disputed_cons_train = train_data[train_data["Consumer disputed?"] == "Yes"].copy()
disputed_cons_test = test_data[test_data["Consumer disputed?"] == "Yes"].copy()

# The plotting cell refers to the disputed complaints simply as `disputed_cons`;
# expose the training subset under that name.
disputed_cons = disputed_cons_train

# Data visualization: share of disputed complaints per product.
# The data has no 'Dispute' column, so a 0/1 indicator is derived from
# "Consumer disputed?" on a temporary copy (train_data itself is not modified).
# barplot's default estimator is the mean, so each bar is the dispute rate.
dispute_flags = train_data.assign(
    Dispute=train_data["Consumer disputed?"].eq("Yes").astype(int)
)
sns.barplot(x='Product', y='Dispute', data=dispute_flags)
plt.show()

In [None]:
# Check the data type for both data (test file and train file)
print(train_data.dtypes)
print(test_data.dtypes)

# Fields removed once the derived date features exist.
_FIELDS_TO_DROP = ['Date received', 'Date sent to company', 'ZIP code', 'Complaint ID']


def _preprocess_complaints(frame):
    """Return ``frame`` with the complaint preprocessing steps applied.

    The steps are: drop columns with more than 25% missing values, derive
    'Day' / 'Month' / 'Year' and 'Days held' from the date columns, drop the
    raw date / ZIP / ID fields, impute null 'State' values with the mode,
    derive 'Week_Received', and clamp negative 'Days held' to zero.

    The function is safe to run on a frame that has already been processed:
    the date-derived fields are only recomputed while the raw date columns
    are still present, and already-dropped fields are ignored.
    """
    # Keep only columns that are at least 75% populated.
    frame = frame.dropna(thresh=len(frame) * 0.75, axis=1)

    # The raw date columns are removed at the end of a first run, so guard
    # on their presence instead of failing with a KeyError on a re-run.
    if 'Date received' in frame.columns:
        received = pd.to_datetime(frame['Date received'])
        frame['Day'] = received.dt.day
        frame['Month'] = received.dt.month
        frame['Year'] = received.dt.year
        if 'Date sent to company' in frame.columns:
            sent = pd.to_datetime(frame['Date sent to company'])
            # Number of whole days the complaint was with the company.
            frame['Days held'] = (sent - received).dt.days

    frame = frame.drop(columns=_FIELDS_TO_DROP, errors='ignore')

    # Impute null states with the most frequent state. Series.mode() ignores
    # nulls and handles string columns; an all-null column has no mode.
    state_modes = frame['State'].mode(dropna=True)
    if not state_modes.empty:
        frame['State'] = frame['State'].fillna(state_modes.iloc[0])

    # Week of the month: days 1-7 -> 1, 8-14 -> 2, ..., 29-31 -> 5.
    frame['Week_Received'] = (frame['Day'] - 1) // 7 + 1

    # Time taken by the authority can't be negative; nulls stay as they are.
    frame['Days held'] = frame['Days held'].clip(lower=0)
    return frame


train_data = _preprocess_complaints(train_data)
test_data = _preprocess_complaints(test_data)

# Store data of disputed people for future tasks. Taken after the clamp above
# so the subsets do not carry negative "Days held"; .copy() detaches them from
# the parent frames.
# NOTE(review): assumes both files contain the "Consumer disputed?" column — confirm for the test file.
disputed_cons_train = train_data[train_data["Consumer disputed?"] == "Yes"].copy()
disputed_cons_test = test_data[test_data["Consumer disputed?"] == "Yes"].copy()

# The plotting code refers to the disputed complaints as `disputed_cons`.
disputed_cons = disputed_cons_train



In [None]:
# The preprocessing step stores the disputed complaints of the training data
# as `disputed_cons_train`; bind the name used by the plots below so they do
# not fail with a NameError.
disputed_cons = disputed_cons_train

# Plot bar graph of the total no of disputes products-wise with the help of seaborn
plt.figure(figsize=(10,6))
sns.countplot(y='Product', data=disputed_cons)
plt.title('Total Number of Disputes Product-Wise')
plt.show()

# Plot bar graph of the total no of disputes with Top Issues by Highest Disputes, with the help of seaborn
# value_counts().nlargest(10) is sorted by count, so passing it as `order`
# draws the bars from the most to the least disputed issue.
top_issues = disputed_cons['Issue'].value_counts().nlargest(10).index
plt.figure(figsize=(10,6))
sns.countplot(
    y='Issue',
    data=disputed_cons[disputed_cons['Issue'].isin(top_issues)],
    order=list(top_issues),
)
plt.title('Top Issues by Highest Disputes')
plt.show()

# Plot bar graph of the total no of disputes by State with Maximum Disputes
# Same approach: restrict to the ten states with the most disputes and draw
# them in descending order of count.
top_states = disputed_cons['State'].value_counts().nlargest(10).index
plt.figure(figsize=(10,6))
sns.countplot(
    y='State',
    data=disputed_cons[disputed_cons['State'].isin(top_states)],
    order=list(top_states),
)
plt.title('States with Maximum Disputes')
plt.show()