In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

/kaggle/input/restaurant-tips-dataset-analysis/tip.csv


In [2]:
# Load the restaurant tips dataset and preview the first rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column that looks like a saved
# row index; consider read_csv(..., index_col=0) -- confirm before changing,
# since it would alter the column count reported later.
tips_csv_path = '/kaggle/input/restaurant-tips-dataset-analysis/tip.csv'
df = pd.read_csv(tips_csv_path)
df.head()


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [3]:
# Report the dimensions of the loaded frame.
n_rows, n_cols = df.shape
print(f"Number of Rows: {n_rows}")
print(f"Number of Columns (Attributes): {n_cols}")


Number of Rows: 244
Number of Columns (Attributes): 7


In [4]:
# Print each column's dtype; for non-numeric columns also print the number of
# distinct values and a preview of them.
for name in df.columns:
    series = df[name]
    print(f"\nColumn: {name}")
    print(f"Type: {series.dtype}")

    # Numeric columns get no category listing.
    if pd.api.types.is_numeric_dtype(series):
        continue

    distinct = series.unique()
    print(f"Unique Values ({len(distinct)}): {distinct[:10]}")  # preview capped at 10 values


Column: total_bill
Type: float64

Column: tip
Type: float64

Column: sex
Type: object
Unique Values (2): ['Female' 'Male']

Column: smoker
Type: object
Unique Values (2): ['No' 'Yes']

Column: day
Type: object
Unique Values (4): ['Sun' 'Sat' 'Thur' 'Fri']

Column: time
Type: object
Unique Values (2): ['Dinner' 'Lunch']

Column: size
Type: int64


### Binary Columns 
Sex : Female - 0 , Male - 1
Smoker : Yes - 1 , No - 0

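### Multi-category Columns (one-hot encoded)
Day : Sun , Sat , Thur , Fri - one 0/1 indicator column per value (day_Sun , day_Sat , day_Thur , day_Fri)

Time : Lunch , Dinner - one 0/1 indicator column per value (time_Lunch , time_Dinner)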




In [5]:
# Encode the categorical columns of `df` as integers so they can be used as
# model features. Binary columns are label encoded with an explicit mapping;
# multi-category columns are one-hot encoded into 0/1 indicator columns.
# The cell is safe to re-run: columns that are already encoded are skipped.

# --- Step 1: Define custom mappings for binary columns ---
binary_mappings = {
    'sex': {'Female': 0, 'Male': 1},
    'smoker': {'No': 0, 'Yes': 1}
}

# --- Step 2: Label encode binary columns ---
for col, mapping in binary_mappings.items():
    if col not in df.columns:
        continue

    observed = set(df[col].dropna().unique())

    # On a re-run the column already holds the 0/1 codes; mapping them again
    # would turn every value into NaN, so leave the column untouched.
    if observed <= set(mapping.values()):
        print(f"Binary column already encoded, skipping: {col}")
        continue

    # Series.map yields NaN for values missing from the mapping; fail loudly
    # here instead of passing NaN features on to the model.
    unexpected = observed - set(mapping)
    if unexpected:
        raise ValueError(
            f"Unmapped values in column '{col}': {sorted(unexpected, key=str)}"
        )

    df[col] = df[col].map(mapping)
    print(f"✅ Label encoded binary column: {col}")

# --- Step 3: One-hot encode multi-category columns with 0/1 ---
multi_category_cols = ['day', 'time']  # Add others if needed

for col in multi_category_cols:
    # Once encoded, the source column no longer exists, so a re-run skips it.
    if col in df.columns:
        dummies = pd.get_dummies(df[col], prefix=col, drop_first=False, dtype=int)
        # Replace the source column with its indicator columns (appended at the end).
        df = pd.concat([df.drop(columns=[col]), dummies], axis=1)
        print(f"✅ One-hot encoded multi-category column: {col} (with 0/1)")

# --- Step 4: Display result ---
print("\n✅ Encoding completed successfully!")
display(df.head(7))  # Display first 7 rows


✅ Label encoded binary column: sex
✅ Label encoded binary column: smoker
✅ One-hot encoded multi-category column: day (with 0/1)
✅ One-hot encoded multi-category column: time (with 0/1)

✅ Encoding completed successfully!


Unnamed: 0,total_bill,tip,sex,smoker,size,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch
0,16.99,1.01,0,0,2,0,0,1,0,1,0
1,10.34,1.66,1,0,3,0,0,1,0,1,0
2,21.01,3.5,1,0,3,0,0,1,0,1,0
3,23.68,3.31,1,0,2,0,0,1,0,1,0
4,24.59,3.61,0,0,4,0,0,1,0,1,0
5,25.29,4.71,1,0,4,0,0,1,0,1,0
6,8.77,2.0,1,0,2,0,0,1,0,1,0


In [6]:
from sklearn.model_selection import train_test_split
# --- Step 1: `df` is expected to be encoded already ---
# Expected columns: total_bill, tip, sex, smoker, size, day_Fri, day_Sat,
# day_Sun, day_Thur, time_Dinner, time_Lunch

# --- Step 2: Separate the predictors (X) from the target (y) ---
feature_columns = [
    'total_bill', 'sex', 'smoker', 'size',
    'day_Fri', 'day_Sat', 'day_Sun', 'day_Thur',
    'time_Dinner', 'time_Lunch',
]
X = df[feature_columns]
y = df['tip']

# --- Step 3: Hold out 20% of the rows for testing (fixed seed for repeatability) ---
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [7]:
from sklearn.linear_model import LinearRegression
# --- Step 4: Fit a linear regression on the training split ---
# fit() returns the estimator itself, so construction and fitting can be chained;
# the trailing expression keeps the fitted estimator as the cell's displayed value.
model = LinearRegression().fit(X_train, y_train)
model

In [8]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# --- Step 5: Predict on Test set ---
y_pred = model.predict(X_test)

# --- Step 6: Evaluate the model ---
# MAE and RMSE are in the same units as the target (`tip`); MSE is in squared units.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Labels are padded to a common width so the values line up in one column.
print("\n📊 Regression Model Evaluation Metrics 📊")
print("--------------------------------------------------")
print(f"Mean Absolute Error (MAE)       : {mae:.4f}")
print(f"Mean Squared Error (MSE)        : {mse:.4f}")
print(f"Root Mean Squared Error (RMSE)  : {rmse:.4f}")
print(f"R² Score                        : {r2:.4f}")
print("--------------------------------------------------")


📊 Regression Model Evaluation Metrics 📊
--------------------------------------------------
Mean Absolute Error (MAE)       : 0.6671
Mean Squared Error (MSE)        : 0.7034
Root Mean Squared Error (RMSE)  : 0.8387
R² Score                        : 0.4373
--------------------------------------------------
