In [9]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [48]:
# Load the data and inspect the first few rows
data_path = 'flipkart Mobiles.csv'
data = pd.read_csv(data_path)
data.head()

Unnamed: 0.1,Unnamed: 0,Title,Storage,Cameras,screen_size,Battery,processor,price
0,0,"SAMSUNG Galaxy F13 (Waterfall Blue, 64 GB)",4 GB RAM | 64 GB ROM | Expandable Upto 1 TB,50MP + 5MP + 2MP | 8MP Front Camera,16.76 cm (6.6 inch) Full HD+ Display,6000 mAh Lithium Ion Battery,Exynos 850 Processor,"₹9,499"
1,1,"SAMSUNG Galaxy F13 (Sunrise Copper, 64 GB)",4 GB RAM | 64 GB ROM | Expandable Upto 1 TB,50MP + 5MP + 2MP | 8MP Front Camera,16.76 cm (6.6 inch) Full HD+ Display,6000 mAh Lithium Ion Battery,Exynos 850 Processor,"₹9,499"
2,2,"APPLE iPhone 11 (White, 64 GB)",64 GB ROM,12MP + 12MP | 12MP Front Camera,15.49 cm (6.1 inch) Liquid Retina HD Display,A13 Bionic Chip Processor,Brand Warranty of 1 Year,"₹35,990"
3,3,IAIR Basic Feature Dual Sim Mobile Phone with ...,32 MB RAM | 32 MB ROM | Expandable Upto 32 GB,0.8MP + 0.8MP,4.5 cm (1.77 inch) Display,1200 mAh Battery,1 Year manufacturer warranty for device and 6 ...,₹887
4,4,"SAMSUNG Galaxy F13 (Nightsky Green, 64 GB)",4 GB RAM | 64 GB ROM | Expandable Upto 1 TB,50MP + 5MP + 2MP | 8MP Front Camera,16.76 cm (6.6 inch) Full HD+ Display,6000 mAh Lithium Ion Battery,Exynos 850 Processor,"₹9,499"


In [49]:
# Combine relevant columns to form descriptions
data['Description'] = data[['Storage', 'Cameras', 'screen_size', 'Battery', 'processor']].apply(lambda x: ' '.join(x.dropna().astype(str)), axis=1)

In [50]:
# Convert price to numeric by removing any non-numeric characters
data['price'] = data['price'].replace('[\₹,]', '', regex=True).astype(float)

In [51]:
# Handle missing values
data = data.dropna(subset=['Title', 'price', 'Description'])

In [52]:
# Ensure all text data is in string format
data['Description'] = data['Description'].astype(str)

In [53]:
# Filter out rows with empty strings in 'Description'
data = data[data['Description'].str.strip() != '']

In [54]:
# Define a function to clean text
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = ''.join([char for char in text if char.isalnum() or char.isspace()])  # Remove non-alphanumeric characters
    return text

In [55]:
# Apply the cleaning function
data['Description'] = data['Description'].apply(clean_text)

In [56]:
# Ensure there are no rows with just stop words
def is_meaningful(text):
    # Simple check: text length greater than a threshold (e.g., 3 words)
    return len(text.split()) > 3

In [57]:
data = data[data['Description'].apply(is_meaningful)]

In [58]:
# Encoding categorical features
label_encoder = LabelEncoder()
data['Title'] = label_encoder.fit_transform(data['Title'])

In [59]:
# Vectorizing text data
tfidf_vectorizer = TfidfVectorizer(max_features=500, stop_words='english')

In [60]:
# Fit and transform the description column
description_tfidf = tfidf_vectorizer.fit_transform(data['Description'])

In [61]:
# Combine all features into a single DataFrame
description_df = pd.DataFrame(description_tfidf.toarray(), columns=[f'desc_{i}' for i in range(description_tfidf.shape[1])])

features = pd.concat([data[['Title', 'price']], description_df], axis=1)

In [62]:
# Define target variable (for demonstration, we'll assume a dummy target variable)
data['Target'] = (data['price'] > data['price'].mean()).astype(int)  # Example target variable
target = data['Target']

In [63]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

In [64]:
# Train the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

In [65]:
# Make predictions
y_pred = rf_model.predict(X_test)

In [66]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

In [68]:

accuracy

0.9940476190476191

In [69]:
report

'              precision    recall  f1-score   support\n\n           0       0.99      1.00      1.00       103\n           1       1.00      0.98      0.99        65\n\n    accuracy                           0.99       168\n   macro avg       1.00      0.99      0.99       168\nweighted avg       0.99      0.99      0.99       168\n'