In [45]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
from category_encoders import CatBoostEncoder

In [46]:
df = pd.read_csv("ramen_ratings.csv")

In [47]:
df.set_index("Review #", inplace=True)
df = df.dropna(subset=["Style", "Country", "Brand"])

# Top Ten column will show whether the item is in top ten
df.loc[df["Top Ten"].notna(), "Top Ten"] = 1
df.loc[df["Top Ten"].isna(), "Top Ten"] = 0

# Drop unrated rows and change data types
df = df[df["Stars"] != "Unrated"]
df["Stars"] = df["Stars"].astype(float)
df["Top Ten"] = df["Top Ten"].astype(int)

df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 2575 entries, 2580 to 1
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Brand    2575 non-null   object 
 1   Variety  2575 non-null   object 
 2   Style    2575 non-null   object 
 3   Country  2575 non-null   object 
 4   Stars    2575 non-null   float64
 5   Top Ten  2575 non-null   int64  
dtypes: float64(1), int64(1), object(4)
memory usage: 140.8+ KB


Unnamed: 0_level_0,Brand,Variety,Style,Country,Stars,Top Ten
Review #,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2580,New Touch,T's Restaurant Tantanmen,Cup,Japan,3.75,0
2579,Just Way,Noodles Spicy Hot Sesame Spicy Hot Sesame Guan...,Pack,Taiwan,1.0,0
2578,Nissin,Cup Noodles Chicken Vegetable,Cup,USA,2.25,0
2577,Wei Lih,GGE Ramen Snack Tomato Flavor,Pack,Taiwan,2.75,0
2576,Ching's Secret,Singapore Curry,Pack,India,3.75,0


In [48]:
X = df[["Brand", "Variety", "Country", "Top Ten", "Style"]]
y = df["Stars"]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [49]:
catboost_cols = ["Brand", "Variety", "Country"]
onehot_cols = ["Style"]

In [50]:
catboost_pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        ("encoder", CatBoostEncoder()),
    ]
)

onehot_pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

preprocessor = ColumnTransformer(
    [
        ("catboost", catboost_pipeline, catboost_cols),
        ("onehot", onehot_pipeline, onehot_cols),
        ("passthrough", "passthrough", ["Top Ten"]),
    ]
)

model = Pipeline(
    [("preprocessing", preprocessor), ("model", RandomForestRegressor(random_state=42))]
)

In [51]:
model.fit(X_train, y_train)

In [52]:
y_pred = model.predict(X_test)

error = mean_squared_error(y_test, y_pred) ** (1 / 2)
error

0.9752665414720864

In [53]:
# Calculate the importance of features
feature_names = model.named_steps["preprocessing"].get_feature_names_out()

importances = model.named_steps["model"].feature_importances_

feat_importance_df = pd.DataFrame(
    {"Feature": feature_names, "Importance": importances}
).sort_values(by="Importance", ascending=False)

feat_importance_df

Unnamed: 0,Feature,Importance
2,catboost__2,0.496772
0,catboost__0,0.391316
8,onehot__Style_Pack,0.024234
4,onehot__Style_Bowl,0.021018
7,onehot__Style_Cup,0.02067
1,catboost__1,0.020244
10,passthrough__Top Ten,0.013196
9,onehot__Style_Tray,0.009788
5,onehot__Style_Box,0.001472
3,onehot__Style_Bar,0.001256


In [None]:
import boto3

# Replace with your actual credentials and info
bucket_name = "dataminds-homeworks"
s3_file_key = "nihat-mammadli-fe2.ipynb"
local_file_path = "nihat-mammadli-fe2.ipynb"

# Create an S3 client
s3 = boto3.client("s3")

# Upload the file
try:
    s3.upload_file(local_file_path, bucket_name, s3_file_key)
    print(f"File uploaded successfully to s3://{bucket_name}/{s3_file_key}")
except Exception as e:
    print("Error uploading file:", e)