In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Location of the training CSV. The original machine-specific path is kept as
# the default; set the TRAIN_CSV_PATH environment variable to run the notebook
# elsewhere without editing this cell.
TRAIN_CSV_PATH = os.environ.get(
    "TRAIN_CSV_PATH",
    r"C:\Users\shiva\OneDrive\Desktop\Data Science and machine Learning\Train.csv",
)
train_data = pd.read_csv(TRAIN_CSV_PATH)

In [None]:
# Preview the first rows instead of printing the whole frame as plain text;
# as the cell's last expression the preview is rendered by the notebook.
train_data.head()

In [None]:
# Number of missing entries in every column.
nan_counts = train_data.isnull().sum(axis=0)
print(nan_counts)

In [None]:
# Distinct values per column. dropna=False keeps a missing value counted as
# one distinct entry, matching len(column.unique()).
unique_count = train_data.nunique(dropna=False)
print(unique_count)

In [None]:
# Box plot of Item_Weight, drawn in the upper slot of a 2x1 subplot grid.
fig = plt.figure(figsize=(8, 6))
weight_ax = fig.add_subplot(2, 1, 1)
sns.boxplot(data=train_data, x='Item_Weight', ax=weight_ax)
plt.show()

In [None]:
# Fill missing item weights with the mean weight recorded for the same
# Item_Identifier; identifiers absent from the per-item table fall back to the
# overall mean weight.
default_mean_weight = train_data['Item_Weight'].mean()
item_avg_weight = train_data.pivot_table(values='Item_Weight', index='Item_Identifier')
missing_values = train_data['Item_Weight'].isnull()
print('Missing values: %d' % missing_values.sum())

# Look up each missing row's identifier in the per-item table; identifiers
# that are not in the table come back as NaN and take the overall mean.
imputed_weights = (
    train_data.loc[missing_values, 'Item_Identifier']
    .map(item_avg_weight['Item_Weight'])
    .fillna(default_mean_weight)
)
train_data.loc[missing_values, 'Item_Weight'] = imputed_weights
print('Missing values after imputation: %d' % train_data['Item_Weight'].isnull().sum())

In [None]:
# Fill missing Outlet_Size entries with the most frequent size.
outlet_missing_values = train_data['Outlet_Size'].isnull()
print('Missing values: %d' % outlet_missing_values.sum())
mode_outlet_size = train_data['Outlet_Size'].mode()[0]
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place call acts on an intermediate object,
# triggers a FutureWarning in recent pandas, and leaves train_data unchanged
# once Copy-on-Write is enabled.
train_data['Outlet_Size'] = train_data['Outlet_Size'].fillna(mode_outlet_size)

In [None]:
# Count the Outlet_Size entries that are still missing.
remaining_missing = train_data['Outlet_Size'].isna().sum()
print('Missing values: %d' % remaining_missing)

In [None]:
# Show the Outlet_Size column (rendered as the cell's last expression).
train_data['Outlet_Size']


In [None]:
# Replace Item_Visibility values of exactly 0 with the mean of the non-zero
# values. A boolean mask is used rather than replace(0, pd.NA) so the column
# keeps its numeric dtype instead of being upcast to object just to compute
# the mean.
visibility = train_data['Item_Visibility']
zero_visibility = visibility == 0
mean_visibility = visibility[~zero_visibility].mean()
train_data['Item_Visibility'] = visibility.mask(zero_visibility, mean_visibility)

In [None]:
# First ten Item_Visibility values.
train_data['Item_Visibility'].iloc[:10]

In [None]:
# The first two characters of Item_Identifier act as a category code, which is
# translated to a readable label. Codes missing from the mapping become NaN.
category_mapping = {
    'FD' : 'Food',
    'DR' : 'Drinks',
    'NC' : 'Non-Consumables'
}
# Vectorised string slicing replaces the per-row lambda, and the code -> label
# lookup is applied in the same pass.
train_data['Item_Category'] = train_data['Item_Identifier'].str[:2].map(category_mapping)

In [None]:
# Frequency of each raw Item_Fat_Content label.
fat_content = train_data['Item_Fat_Content']
fat_content_count = fat_content.value_counts()
print(fat_content_count)

In [None]:
# Collapse the alternative spellings onto the labels 'Low Fat' and 'Regular'.
fat_content_aliases = {
    'LF': 'Low Fat',
    'low fat': 'Low Fat',
    'reg': 'Regular',
}
train_data['Item_Fat_Content'] = train_data['Item_Fat_Content'].replace(fat_content_aliases)
train_data['Item_Fat_Content'].value_counts()


In [None]:
# Sales distribution by item category (upper panel) and fat content (lower panel).
fig = plt.figure(figsize=(10, 6))

category_ax = fig.add_subplot(2, 1, 1)
sns.boxplot(data=train_data, x='Item_Category', y='Item_Outlet_Sales', ax=category_ax)

fat_ax = fig.add_subplot(2, 1, 2)
sns.boxplot(data=train_data, x='Item_Fat_Content', y='Item_Outlet_Sales', ax=fat_ax)

fig.subplots_adjust(wspace=0.2, hspace=0.4, top=1.5)
# plt.xticks acts on the current axes, which is the lower panel added last.
plt.xticks(rotation=45)
plt.show()


In [None]:
# One sales box plot per outlet attribute, stacked in a 3x1 grid.
fig = plt.figure(figsize=(10, 9))
outlet_columns = ['Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']
for row, column in enumerate(outlet_columns, start=1):
    panel = fig.add_subplot(3, 1, row)
    sns.boxplot(data=train_data, x=column, y='Item_Outlet_Sales', ax=panel)
fig.subplots_adjust(wspace=0.2, hspace=0.4, top=1.5)
plt.show()

In [None]:
# Mean sales per outlet establishment year as a green bar chart.
(
    train_data
    .groupby('Outlet_Establishment_Year')['Item_Outlet_Sales']
    .mean()
    .plot(kind='bar', color='green')
)

In [None]:
import random

# Mean sales for each outlet establishment year (one bar per year).
sales_by_year = train_data.groupby('Outlet_Establishment_Year')['Item_Outlet_Sales'].mean()

# One random RGB colour per bar. The bar count is the number of groups
# (len), not the number of distinct mean values (nunique), and a dedicated
# seeded generator keeps the colours the same on every run.
num_bars = len(sales_by_year)
color_rng = random.Random(42)
colors = [(color_rng.random(), color_rng.random(), color_rng.random()) for _ in range(num_bars)]

# Create a bar plot with a different colour for each bar
fig, ax = plt.subplots(figsize=(10, 6))
sales_by_year.plot(kind='bar', color=colors, ax=ax)
ax.set_xlabel('Outlet Establishment Year')
ax.set_ylabel('Average Sales')
ax.set_title('Average Sales by Outlet Establishment Year')
ax.tick_params(axis='x', rotation=45)
plt.show()

In [None]:
# Outlet age measured against the reference year 2009.
train_data['Outlet_Years'] = 2009 - train_data['Outlet_Establishment_Year']
# Only a cell's last expression is rendered automatically, so the preview is
# printed explicitly; previously its result was silently discarded.
print(train_data['Outlet_Years'].head(10))
train_data['Outlet_Years'].describe()


In [None]:
# Bar plot of Item_Outlet_Sales for each Outlet_Years value.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=train_data, x='Outlet_Years', y='Item_Outlet_Sales', ax=ax)
ax.set_title('Outlet years vs Item_Outlet_Sales')


In [None]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# Integer code for each outlet, stored next to the original identifier column.
train_data['Outlet'] = le.fit_transform(train_data['Outlet_Identifier'])

# Re-fit the same encoder on each categorical column in turn and overwrite the
# column with its integer codes; after the loop `le` holds only the last fit.
lis_col = [
    'Item_Fat_Content',
    'Outlet_Location_Type',
    'Outlet_Size',
    'Item_Category',
    'Outlet_Type',
    'Outlet',
]
for col in lis_col:
    train_data[col] = le.fit_transform(train_data[col])

In [None]:
# Expand each listed column into one indicator column per distinct value.
dummy_columns = [
    'Item_Fat_Content',
    'Outlet_Location_Type',
    'Outlet_Size',
    'Outlet_Type',
    'Item_Category',
    'Outlet',
]
train_data = pd.get_dummies(train_data, columns=dummy_columns)



In [None]:
# Show the dtype of every column in train_data.
train_data.dtypes

In [None]:
from sklearn.model_selection import train_test_split

# The sales column is the target; every other column stays in the feature frame.
y = train_data['Item_Outlet_Sales']
X = train_data.drop('Item_Outlet_Sales', axis=1)

# Hold out 20% of the rows, with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# X_train still contains text columns at this point (e.g. Item_Identifier),
# which the forest cannot fit, so importances are computed on the numeric and
# boolean (indicator) columns only.
importance_features = X_train.select_dtypes(include=['number', 'bool'])

rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(importance_features, y_train)

feature_importances = rf.feature_importances_

# Visualize feature importances, labelling each bar with the column it was
# computed for.
bar_positions = range(len(feature_importances))
plt.figure(figsize=(10, 6))
plt.barh(bar_positions, feature_importances, align='center')
plt.yticks(bar_positions, importance_features.columns)
plt.xlabel('Feature Importance')
plt.title('Feature Importance Analysis')
plt.show()


In [None]:
# Show the training feature frame.
X_train

In [None]:
# Remove Item_Type and Outlet_Establishment_Year from train_data. The result
# is reassigned and missing columns are ignored, so re-running the cell does
# not raise KeyError. NOTE: X_train / X_test were split off earlier and are
# not changed by this line.
train_data = train_data.drop(columns=['Item_Type', 'Outlet_Establishment_Year'], errors='ignore')

In [None]:
# Drop the same non-feature columns from X_train that are dropped from X_test
# (Item_Identifier, Item_Type, Outlet_Establishment_Year). Previously only
# Item_Identifier was removed here, leaving X_train with a text column and
# with columns X_test does not have. errors='ignore' keeps the cell re-runnable.
X_train = X_train.drop(
    columns=['Item_Identifier', 'Item_Type', 'Outlet_Establishment_Year'],
    errors='ignore',
)

In [None]:
# Show the training feature frame again.
X_train

In [None]:
# Drop the non-feature columns from X_test. The result is reassigned and
# missing columns are ignored, so re-running the cell does not raise KeyError.
X_test = X_test.drop(
    columns=['Item_Identifier', 'Item_Type', 'Outlet_Establishment_Year'],
    errors='ignore',
)

In [None]:
# Remove the raw Outlet_Identifier column from both splits. The results are
# reassigned and a missing column is ignored, so the cell can be re-run safely.
X_train = X_train.drop(columns=['Outlet_Identifier'], errors='ignore')
X_test = X_test.drop(columns=['Outlet_Identifier'], errors='ignore')

In [None]:
# Show the training feature frame after the column drops.
X_train

In [None]:
# Show the test feature frame after the column drops.
X_test

In [None]:
# Fit a 100-tree random forest (fixed seed) on the training split.
rf_regressor = RandomForestRegressor(random_state=42, n_estimators=100)
rf_regressor.fit(X_train, y_train)

# Predict sales for the held-out rows.
y_pred = rf_regressor.predict(X_test)




In [None]:
# Print the predictions made for X_test.
print(y_pred)

In [None]:
# Scatter of predictions against the true target values.
scatter_ax = plt.gca()
scatter_ax.scatter(y_test, y_pred)
scatter_ax.set_xlabel("Actual Values")
scatter_ax.set_ylabel("Predicted Values")
scatter_ax.set_title("Actual vs. Predicted Values")
plt.show()

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Compute each metric once on the held-out split, then report them.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("R-squared:", r2)

In [None]:
# Residual = actual minus predicted, plotted against the actual value.
residuals = y_test - y_pred

fig, residual_ax = plt.subplots(figsize=(8, 6))
residual_ax.scatter(y_test, residuals, alpha=0.5)
residual_ax.set_xlabel("Actual Values")
residual_ax.set_ylabel("Residuals (Actual - Predicted)")
residual_ax.set_title("Residual Plot")
# Dashed zero line as the reference for a perfect prediction.
residual_ax.axhline(y=0, color='r', linestyle='--')
plt.show()

In [None]:
# Show the held-out target values.
y_test