In [3]:
### =================================================== ###
###           FINAL SCRIPT V3 (SIMPLIFIED)              ###
### =================================================== ###

# 1. IMPORT LIBRARIES
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
import pickle

# 2. LOAD DATA
# Read the rental listings CSV into a DataFrame.
df = pd.read_csv('/content/House_Rent_Dataset.csv')
print("Step 1: Data loaded successfully.")

# 3. DATA CLEANING AND PREPROCESSING
# --- Step A: Turn the text 'Floor' column into a number ---
# Work on a normalised text copy: stringified, trimmed and lower-cased.
floor_text = df['Floor'].astype(str).str.strip().str.lower()

# Swap each named floor for its numeric code. These are literal
# (non-regex) substring replacements, applied in the order listed.
for floor_name, floor_code in (
    ('ground', '0'),
    ('upper basement', '-1'),
    ('lower basement', '-2'),
):
    floor_text = floor_text.str.replace(floor_name, floor_code, regex=False)

# For "X out of Y" entries keep only the part before ' out of ', then
# convert the remaining text to a numeric column.
df['Floor'] = pd.to_numeric(floor_text.str.split(' out of ').str[0])

# --- Step B: One-hot encode the remaining categorical text columns ---
# drop_first=True omits the first level of each encoded column.
categorical_columns = [
    'City',
    'Furnishing Status',
    'Area Type',
    'Tenant Preferred',
    'Point of Contact',
]
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)
print("Step 2: Data cleaning complete.")

# 4. DEFINE FEATURES AND TARGET
# The target is the 'Rent' column; the features are every remaining
# column except 'Posted On' and 'Area Locality'.
y = df['Rent']
X = df.drop(columns=['Rent', 'Posted On', 'Area Locality'])

# 5. SPLIT DATA
# Hold out 20% of the rows for evaluation, with a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 6. TRAIN THE MODEL
print("Step 3: Training the model...")
# 100 trees, fixed random_state, n_jobs=-1; fit() hands back the fitted
# estimator, so construction and training are chained.
model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)
print("Model training complete!")

# 7. EVALUATE THE MODEL
# Score the held-out predictions with R-squared and report it to 2 decimals.
y_pred = model.predict(X_test)
score = r2_score(y_true=y_test, y_pred=y_pred)
print(f"\nModel Performance (R-squared score): {score:.2f}")

# 8. SAVE THE MODEL AND THE COLUMN ORDER
# Two pickles are written: the trained model, and the column labels of X
# (in order). Each one is followed by its own confirmation message.
model_columns = X.columns
for artifact, pickle_path, done_message in (
    (
        model,
        'rent_predictor.pkl',
        "\nStep 4: Model saved successfully as rent_predictor.pkl!",
    ),
    (
        model_columns,
        'model_columns.pkl',
        "\nColumn order saved successfully as model_columns.pkl!",
    ),
):
    with open(pickle_path, 'wb') as file:
        pickle.dump(artifact, file)
    print(done_message)

Step 1: Data loaded successfully.
Step 2: Data cleaning complete.
Step 3: Training the model...
Model training complete!

Model Performance (R-squared score): 0.66

Step 4: Model saved successfully as rent_predictor.pkl!

Column order saved successfully as model_columns.pkl!
