In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegressionCV

In [2]:
# Read the input CSV and wrap the result in the working frame `df`.
csv_path = 'unsupervised(elian).csv'
data = pd.read_csv(csv_path)
df = pd.DataFrame(data)

In [3]:
# Remove the 'cluster' column from the working frame.
# Rebinding `df` (rather than inplace=True) together with errors='ignore'
# keeps this cell re-runnable: a second execution is a no-op instead of
# raising KeyError because the column is already gone.
df = df.drop(columns='cluster', errors='ignore')

In [4]:
# Preview the first five rows of the working frame.
df.head(n=5)

Unnamed: 0,price,availability_365,room_type,review_scores_rating,neighbourhood_group_cleansed
0,48.0,365,Private room,4.6,East Region
1,60.0,244,Private room,4.69,North-East Region
2,50.0,365,Private room,4.7,East Region
3,200.0,362,Entire home/apt,4.93,East Region
4,60.0,239,Private room,4.81,North-East Region


In [9]:
# One-hot encode the two categorical columns. drop_first=True omits the first
# level of each column so it acts as the implicit baseline.
categorical_cols = ['room_type', 'neighbourhood_group_cleansed']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

In [10]:
# Preview the first five rows after one-hot encoding.
df.head(n=5)

Unnamed: 0,price,availability_365,review_scores_rating,room_type_Private room,neighbourhood_group_cleansed_North Region,neighbourhood_group_cleansed_North-East Region,neighbourhood_group_cleansed_West Region
0,48.0,365,4.6,1,0,0,0
1,60.0,244,4.69,1,0,1,0
2,50.0,365,4.7,1,0,0,0
3,200.0,362,4.93,0,0,0,0
4,60.0,239,4.81,1,0,1,0


In [13]:
# estimated_revenue = price x availability_365, computed row by row.
df["estimated_revenue"] = df["price"].mul(df["availability_365"])

In [15]:
# Bin estimated_revenue into three equal-frequency (tercile) classes.
revenue_labels = ["low", "mid", "high"]
df["revenue_category"] = pd.qcut(df["estimated_revenue"], q=3, labels=revenue_labels)

# Class counts restricted to rows whose availability_365 is not 365.
not_full_year = df['availability_365'] != 365
print(df.loc[not_full_year, 'revenue_category'].value_counts())
df.head()

low     58
high    50
mid     43
Name: revenue_category, dtype: int64


Unnamed: 0,price,availability_365,review_scores_rating,room_type_Private room,neighbourhood_group_cleansed_North Region,neighbourhood_group_cleansed_North-East Region,neighbourhood_group_cleansed_West Region,estimated_revenue,revenue_category
0,48.0,365,4.6,1,0,0,0,17520.0,low
1,60.0,244,4.69,1,0,1,0,14640.0,low
2,50.0,365,4.7,1,0,0,0,18250.0,low
3,200.0,362,4.93,0,0,0,0,72400.0,high
4,60.0,239,4.81,1,0,1,0,14340.0,low


In [16]:
from sklearn.preprocessing import LabelEncoder

# Encode the revenue class labels as integers.
# LabelEncoder orders classes by sorted label text, so the codes are
# high=0, low=1, mid=2 (not the low < mid < high order of the bins).
le = LabelEncoder()
le.fit(df['revenue_category'])
df['revenue_category_encoded'] = le.transform(df['revenue_category'])


In [17]:
# Preview the first five rows, now including the encoded target column.
df.head(n=5)

Unnamed: 0,price,availability_365,review_scores_rating,room_type_Private room,neighbourhood_group_cleansed_North Region,neighbourhood_group_cleansed_North-East Region,neighbourhood_group_cleansed_West Region,estimated_revenue,revenue_category,revenue_category_encoded
0,48.0,365,4.6,1,0,0,0,17520.0,low,1
1,60.0,244,4.69,1,0,1,0,14640.0,low,1
2,50.0,365,4.7,1,0,0,0,18250.0,low,1
3,200.0,362,4.93,0,0,0,0,72400.0,high,0
4,60.0,239,4.81,1,0,1,0,14340.0,low,1


In [20]:
# Split the frame into the feature matrix `x` and the target vector `y`.
# The revenue value and both forms of its category are excluded from the features.
# NOTE(review): the target is binned from price * availability_365, and both of
# those columns remain in `x` -- confirm this is intended.
target_col = 'revenue_category_encoded'
non_feature_cols = ['revenue_category', 'estimated_revenue', target_col]
x = df.drop(columns=non_feature_cols)
y = df[target_col]


In [11]:
from sklearn.model_selection import train_test_split

In [21]:
# 80/20 hold-out split, stratified on the target so class proportions are
# preserved in both parts; random_state pins the shuffle.
split = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = split

In [22]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt

In [30]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same
# transformation to the test split so test rows do not influence the
# scaling statistics used for training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Standardise the full feature matrix with a separate scaler instance so that
# `scaler` stays the train-fitted one that matches X_train_scaled / X_test_scaled.
# NOTE(review): `x` is replaced by an array scaled with statistics from all
# rows; cross-validation run directly on it lets validation folds influence
# the scaling. Prefer scaling inside a Pipeline when cross-validating.
x = StandardScaler().fit_transform(x)

In [31]:
# Fit a logistic regression with built-in cross-validated regularisation on
# the scaled training split, then predict labels for the scaled test split.
model = LogisticRegressionCV(max_iter=1000).fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

In [32]:
# Hold-out evaluation: overall accuracy plus per-class precision/recall/F1.
acc = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Test Accuracy: {acc:.4f}")
print("Classification Report:")
print(report)

Test Accuracy: 0.8421
Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.85      0.88        13
           1       0.92      0.85      0.88        13
           2       0.71      0.83      0.77        12

    accuracy                           0.84        38
   macro avg       0.85      0.84      0.84        38
weighted avg       0.85      0.84      0.85        38



In [34]:
from sklearn.pipeline import make_pipeline

# 5-fold cross-validated accuracy with scaling done *inside* each fold.
# The pipeline re-fits StandardScaler on the training part of every fold, so
# validation rows never contribute to the scaling statistics. Because
# standardisation is re-fitted per fold, this gives leak-free scaling whether
# `x` holds raw features or features that were already standardised globally.
# cross_val_score clones the pipeline, so the already-fitted `model` is left untouched.
# NOTE: `x` / `y` cover all rows, including those in the held-out test split.
cv_pipeline = make_pipeline(StandardScaler(), model)
cv_scores = cross_val_score(cv_pipeline, x, y, cv=5, scoring='accuracy')
print(f"Cross-Validation Accuracy (5-fold): {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

Cross-Validation Accuracy (5-fold): 0.8579 ± 0.0268
