In [53]:
# Colab-only: mount Google Drive at /content/drive so the CSV files read
# further down (under /content/drive/MyDrive/...) are reachable.
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [54]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ipywidgets.widgets.trait_types import date_from_json
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor


In [None]:
# Folder on the mounted Drive that holds both wine-quality CSV files.
DATA_DIR = "/content/drive/MyDrive/winequality"

# NOTE(review): the files are parsed with pandas' default ',' separator.
# The original UCI distribution of this dataset is ';'-separated - confirm
# the copies on Drive really are comma-separated.
red_wine = pd.read_csv(f"{DATA_DIR}/winequality-red.csv")
white_wine = pd.read_csv(f"{DATA_DIR}/winequality-white.csv")

# Tag each frame with its origin before stacking them.
red_wine['wine_type'] = 'red'
white_wine['wine_type'] = 'white'

# ignore_index=True gives the combined frame unique 0..n-1 row labels.
# Without it both files keep their own 0..n labels, so every low label
# appears twice and label-based alignment downstream becomes ambiguous.
df = pd.concat([red_wine, white_wine], axis=0, ignore_index=True)

df.head()

In [None]:
# Missing-value count per column of the combined raw frame.
df.isna().sum()

In [None]:
# Distinct values present in the 'quality' column.
df['quality'].unique()

In [None]:
# Column dtypes and non-null counts of the combined frame.
df.info()

In [168]:
def find_zscore(dataframe, threshold=3):
    """Flag rows that contain at least one z-score outlier.

    Z-scores are computed column-wise (sample standard deviation) on the
    numeric columns of ``dataframe``, skipping ``int64`` columns exactly as
    before. Non-numeric columns (object, string, category, datetime, bool)
    are ignored instead of being passed to ``mean()``/``std()``, where
    category/string columns used to raise.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input data; it is not modified.
    threshold : float, default 3
        A value is an outlier when its absolute z-score exceeds this.

    Returns
    -------
    pandas.Series
        Boolean mask indexed like ``dataframe``; True where any scored
        column exceeds ``threshold``. Constant columns yield NaN z-scores,
        which compare as False.
    """
    # Explicit include-list: only numeric dtypes are scored; int64 stays
    # excluded to preserve the original selection for numeric data.
    numeric = dataframe.select_dtypes(include='number', exclude='int64')
    zscores = ((numeric - numeric.mean()) / numeric.std()).abs()
    return (zscores > threshold).any(axis=1)


def find_IQR(features, threshold=1.5, dataframe=None):
    """Flag values lying outside the Tukey IQR fences.

    Parameters
    ----------
    features : str or list of str
        Column name (returns a Series) or list of column names (returns a
        DataFrame) to test.
    threshold : float, default 1.5
        Multiplier applied to the inter-quartile range to build the fences.
    dataframe : pandas.DataFrame, optional
        Frame to analyse. When omitted the notebook-level ``df`` is used,
        which keeps existing ``find_IQR(column)`` calls working.

    Returns
    -------
    pandas.Series or pandas.DataFrame
        Boolean mask, True where a value is below ``Q1 - threshold*IQR`` or
        above ``Q3 + threshold*IQR``.
    """
    # Fall back to the global frame only when the caller did not pass one.
    data = df if dataframe is None else dataframe
    values = data[features]

    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    low_bound = q1 - threshold * iqr
    high_bound = q3 + threshold * iqr

    return (values > high_bound) | (values < low_bound)

In [None]:
# Boolean row mask from the z-score rule, then the flagged rows themselves.
outliers_zscore = find_zscore(dataframe=df)
print("Rows with outliers:")
print(df.loc[outliers_zscore])

In [None]:
# Build the IQR outlier mask for every float64 column in one vectorised
# call. find_IQR accepts a list of columns and returns a boolean DataFrame
# with the same columns, so there is no need to grow a frame with
# pd.concat inside a loop (quadratic, and fragile when row labels repeat).
float_columns = df.select_dtypes(include='float64').columns.tolist()
outliers_IQR = find_IQR(float_columns)

# NOTE(review): what is printed is the per-cell boolean mask, not the rows.
print("Rows with outliers:")
print(outliers_IQR)

In [None]:
# One figure per float64 column: rows flagged by the z-score rule next to
# the rows that were kept.
for column in df.select_dtypes(include='float64').columns:
    flagged = df.loc[outliers_zscore, column]
    kept = df.loc[~outliers_zscore, column]

    plt.figure(figsize=(8, 5))
    plt.boxplot([flagged, kept], labels=['Outliers', 'Non-Outliers'])
    plt.title(f'Box Plot of {column}')
    plt.ylabel(column)
    plt.show()

In [None]:
# Keep only rows without z-score outliers. The explicit .copy() makes
# df_cleaned an independent frame, so the later column assignment
# (wine_type encoding) does not trigger SettingWithCopyWarning or write
# into a view of df.
df_cleaned = df.loc[~outliers_zscore].copy()
# Snapshot of the cleaned frame taken before any further changes.
df_temp = df_cleaned.copy()
df_cleaned.info()
df_cleaned.head()

In [None]:
# Missing-value count per column after outlier removal.
df_cleaned.isna().sum()

In [None]:


# df_cleaned still contains the string column 'wine_type' at this point
# (it is label-encoded further down). numeric_only=True restricts the
# correlation to numeric columns; without it pandas >= 2.0 raises
# "could not convert string to float".
correlation_matrix = df_cleaned.corr(numeric_only=True)

plt.figure(figsize=(10,8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5)
plt.title("Correlation Matrix")
plt.show()

In [None]:
# Pairwise scatter matrix of the cleaned data with KDE curves on the diagonal.
sns.pairplot(data=df_cleaned, diag_kind='kde')
plt.show()

In [None]:
# Every column except the target.
features = [col for col in df_cleaned.columns if col != 'quality']

# For each feature, plot the mean quality observed at each distinct feature
# value. The x-axis carries the feature values and the y-axis the mean
# quality; the previous labels were swapped.
# NOTE(review): continuous features produce one bar per distinct value,
# which can be very dense - consider binning before grouping.
for feat in features:
    plt.figure(figsize=(15,8))
    df_cleaned.groupby(feat)['quality'].mean().plot(kind='bar', color='green')
    plt.title(f'Mean quality by {feat}')
    plt.xlabel(feat)
    plt.ylabel('Mean quality')
    plt.xticks(rotation=90)
    plt.show()

In [None]:
target_variable = 'quality'

# Scatter of every float64 feature against the target, drawn from `df`
# (not df_cleaned). The title now says "Scatter Plot" because plt.scatter
# is what is drawn; it previously claimed to be a line plot.
for feature in df.columns:
    if feature != target_variable and df[feature].dtype == 'float64':
        plt.figure(figsize=(12, 5))
        plt.scatter(df[feature], df[target_variable], marker='o', alpha=0.5)
        plt.title(f'Scatter Plot of {feature} vs. {target_variable}')
        plt.xlabel(feature)
        plt.ylabel(target_variable)
        plt.grid(True)
        plt.show()

In [None]:
# Summary statistics of the cleaned frame.
df_cleaned.describe()

In [None]:
# Encode the categorical 'wine_type' column as integers. assign() returns
# a new frame and rebinds df_cleaned, so nothing is written into a
# possibly sliced frame (no SettingWithCopyWarning).
# NOTE(review): re-running this cell re-encodes the already-encoded column,
# after which classes_ shows the integer codes rather than the names.
label_encoder = LabelEncoder()
df_cleaned = df_cleaned.assign(
    wine_type=label_encoder.fit_transform(df_cleaned['wine_type'])
)
print("Unique labels:", label_encoder.classes_)


In [None]:
# One histogram per numeric column. Passing the whole DataFrame to
# plt.hist bins every column on a single shared axis without a legend, so
# the columns with the largest values set the range and the rest collapse
# into the first bin.
axes = df_cleaned.hist(bins=10, figsize=(14, 10))
for ax in axes.ravel():
    ax.set_xlabel('Values')
    ax.set_ylabel('Frequency')
plt.suptitle('Histogram of Data')
plt.tight_layout()
plt.show()

In [194]:
# Small offset so log() never sees an exact zero.
epsilon = 1e-10

# The log transform used to be assigned to `transformed_data` and then
# overwritten on the next line. It now has its own name so neither result
# is silently discarded.
log_transformed_data = np.log(df_cleaned + epsilon)

# `transformed_data` remains the square-root transform, which is the value
# the following cells already worked with.
transformed_data = np.sqrt(df_cleaned)

In [None]:
# Per-column histograms of the square-root-transformed data (one subplot
# per numeric column instead of all columns on one shared axis). The title
# names the transform so this figure is distinguishable from the
# histogram of the untransformed data.
axes = transformed_data.hist(bins=10, figsize=(14, 10))
for ax in axes.ravel():
    ax.set_xlabel('Values')
    ax.set_ylabel('Frequency')
plt.suptitle('Histogram of Square-Root-Transformed Data')
plt.tight_layout()
plt.show()

In [None]:
# Missing-value count per column of the cleaned frame.
df_cleaned.isna().sum()

In [None]:
column_name = 'quality'

# Element-wise check: True where the stored value is an int or a float.
is_numeric = df_cleaned[column_name].apply(lambda x: isinstance(x, (int, float)))
non_numeric_values = df_cleaned[~is_numeric]
count_non_numeric = len(non_numeric_values)

print(f"Non-numeric values in '{column_name}':")
print(non_numeric_values)
print(f"Count of non-numeric values in '{column_name}':", count_non_numeric)


In [199]:
# Columns to rescale: every float64 column once 'quality' is set aside.
predictors = df_cleaned.drop(columns=['quality'])
features_to_scale = predictors.select_dtypes(include=['float64']).columns

# NOTE(review): the scaler is fitted on the full dataset before any
# train/test split, so test rows influence the scaling - confirm whether
# that is acceptable for the reported metrics.
scaler = MinMaxScaler()
df_scaled = df_cleaned.copy()
df_scaled[features_to_scale] = scaler.fit_transform(df_cleaned[features_to_scale])



In [None]:
# Column dtypes and non-null counts after scaling.
df_scaled.info()

In [256]:
df = df_scaled
df = df.drop('wine_type', axis=1)


In [None]:
X=df.iloc[:,:-1]
Y=df.iloc[:,-1]
print(X.shape)
Y.shape
X.columns

In [258]:
# Number of missing target values.
Y.isna().sum()

0

In [None]:
# Standardise the predictors, then project them onto 10 principal components.
# NOTE(review): both the scaler and the PCA are fitted on all rows before
# the train/test split - confirm that this leakage is acceptable.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

pca = PCA(n_components=10)
X_reduced = pca.fit_transform(X_scaled)

X_reduced.shape

In [None]:
# Shape of the PCA output (rows, components).
X_reduced.shape

In [246]:
# Wrap the PCA output in a DataFrame that keeps the row labels of the
# pre-PCA predictors (so X and Y share an index) and names the columns
# PC1..PCk instead of anonymous integers.
component_names = [f"PC{i + 1}" for i in range(X_reduced.shape[1])]
X_reduced = pd.DataFrame(
    np.asarray(X_reduced), index=X.index, columns=component_names
)

# From here on X refers to the reduced feature matrix.
X = X_reduced

# head() is now the last expression so the preview is actually displayed;
# it used to sit mid-cell where its result was discarded.
X_reduced.head()

In [259]:
# 70/30 split with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=50,
)

In [None]:
# Number of missing target values in the training split.
Y_train.isna().sum()

In [None]:
# Linear regression fitted on the training split.
model = LinearRegression()
model.fit(X=X_train, y=Y_train)

In [261]:
# Predictions of whichever estimator is currently bound to `model`.
y_pred = model.predict(X_test)

In [None]:
# Evaluate the current predictions against the held-out targets.
mse = mean_squared_error(Y_test, y_pred)
r2 = r2_score(Y_test, y_pred)

# Labels fixed ("sqaured" typo) and aligned with the wording used by the
# other two evaluation cells in this notebook.
print("Mean Squared Error:", mse)
print("R-squared (R2) Score:", r2)

In [None]:
# Actual targets against model predictions on the test split.
fig, ax = plt.subplots()
ax.scatter(Y_test, y_pred)
ax.set_xlabel("Actual Values")
ax.set_ylabel("Predicted Values")
ax.set_title("Actual vs. Predicted Values")
plt.show()

In [None]:
# Random forest with 100 trees and a fixed seed; replaces the estimator
# previously bound to `model`.
model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
)
model.fit(X=X_train, y=Y_train)

In [270]:
# Score the current `model` on the test split.
y_pred = model.predict(X_test)

mse, r2 = (metric(Y_test, y_pred) for metric in (mean_squared_error, r2_score))

print("Mean Squared Error:", mse)
print("R-squared (R2) Score:", r2)

Mean Squared Error: 0.35489134775374376
R-squared (R2) Score: 0.5174259513161064


In [None]:
# Horizontal bar chart of the fitted model's importances, one bar per
# column of X.
# NOTE(review): X was replaced by the PCA output earlier, so these look
# like importances of principal components rather than of the original
# features - confirm.
feature_importance = model.feature_importances_

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(X.columns, feature_importance)
ax.set_xlabel("Feature Importance")
ax.set_ylabel("Features")
ax.set_title("Random Forest Feature Importance")
plt.show()

In [None]:
# Single decision tree with a fixed seed: fit, predict, then score with the
# same two metrics as the other models.
model = DecisionTreeRegressor(random_state=42)
model.fit(X=X_train, y=Y_train)
y_pred = model.predict(X_test)

mse, r2 = (metric(Y_test, y_pred) for metric in (mean_squared_error, r2_score))

print("Mean Squared Error:", mse)
print("R-squared (R2) Score:", r2)
