In [1]:
import pandas as pd

In [3]:
df=pd.read_csv('datasets/drug_data.csv')

In [4]:
from sklearn.preprocessing import RobustScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Assuming df is your original DataFrame
text_feature = 'review'
numerical_features = ['sentiment_rate', 'prescription_count', 'acceptance_rate','usefulcount', 'positive_count','avg_rating','avg_sentiment','final_rate','scaled_sentiment','scaled_rating']
target = 'rating'

X_text = df[text_feature]
X_numerical = df[numerical_features]
y = df[target]

In [5]:
import numpy as np
X_numerical = X_numerical.replace([np.inf, -np.inf], np.nan)
mask = X_numerical.notna().all(axis=1)
X_numerical = X_numerical[mask]
X_text = X_text[mask]
y = y[mask]
for feature in numerical_features:
    lower_bound = X_numerical[feature].quantile(0.001)
    upper_bound = X_numerical[feature].quantile(0.999)
    X_numerical[feature] = X_numerical[feature].clip(lower_bound, upper_bound)

In [6]:
scaler = RobustScaler()
X_numerical_scaled = scaler.fit_transform(X_numerical)
tfidf = TfidfVectorizer(max_features=5000)
X_text_tfidf = tfidf.fit_transform(X_text)

In [7]:
X_combined = hstack([X_text_tfidf, X_numerical_scaled])
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

In [8]:
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report
lgb = LGBMClassifier(objective='multiclass', num_class=10, random_state=42)
lgb.fit(X_train, y_train)
y_pred = lgb.predict(X_test)
print(classification_report(y_test, y_pred))

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.140981 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 156976
[LightGBM] [Info] Number of data points in the train set: 18638, number of used features: 3469
[LightGBM] [Info] Start training from score -2.012920
[LightGBM] [Info] Start training from score -3.408089
[LightGBM] [Info] Start training from score -3.380909
[LightGBM] [Info] Start training from score -3.604447
[LightGBM] [Info] Start training from score -3.164730
[LightGBM] [Info] Start training from score -3.281877
[LightGBM] [Info] Start training from score -2.868822
[LightGBM] [Info] Start training from score -2.132210
[LightGBM] [Info] Start training from score -1.781936
[LightGBM] [Info] Start training from score -1.053708
              precision    recall  f1-score   support

           1       0.87      0.98      0.92       635
           2       0.87      0.49      0.62       185
    

In [9]:
import joblib

# Save the trained model
joblib.dump(lgb, 'lgb_model.pkl')

# Save the TF-IDF vectorizer, scaler, and numerical feature names
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(numerical_features, 'numerical_features.pkl')

['numerical_features.pkl']

In [10]:
import os
os.getcwd()

'c:\\Users\\HP\\Desktop\\my_project'

In [12]:
import joblib
import pandas as pd
from scipy.sparse import hstack

def get_reviews_for_drug(df, drug_name):
    return df[df['drugname'] == drug_name]

def preprocess_reviews(reviews):
    reviews['review'] = reviews['review'].fillna('')  # Fill NaN with empty string
    reviews['review'] = reviews['review'].str.lower()  # Convert to lowercase
    return reviews

def transform_text_data(reviews, tfidf):
    return tfidf.transform(reviews['review'])

def combine_features(reviews, scaler, tfidf, numerical_features):
    X_numerical = reviews[numerical_features]  # Ensure the order matches
    X_numerical = X_numerical.replace([np.inf, -np.inf], np.nan).fillna(0)
    X_numerical_scaled = scaler.transform(X_numerical)
    X_text_tfidf = transform_text_data(reviews, tfidf)
    return hstack([X_text_tfidf, X_numerical_scaled])

def predict_drug_rating(drug_name):
    df = pd.read_csv('datasets/drug_data.csv')  # Replace with your dataset path

    # Load the saved model and preprocessing components
    lgb_model = joblib.load('models/lgb_model.pkl')
    tfidf_vectorizer = joblib.load('models/tfidf_vectorizer.pkl')
    scaler = joblib.load('models/scaler.pkl')
    numerical_features = joblib.load('models/numerical_features.pkl')

    # Get reviews for the specified drug
    reviews = get_reviews_for_drug(df, drug_name)

    # Preprocess the reviews
    reviews = preprocess_reviews(reviews)

    # Combine features
    X_combined = combine_features(reviews, scaler, tfidf_vectorizer, numerical_features)

    # Predict ratings
    predictions = lgb_model.predict(X_combined)

    # Return or print the predicted ratings
    return predictions

# Example usage:
drug_name = input('Enter the drug name: ')
predicted_ratings = predict_drug_rating(drug_name)
print(f'Predicted ratings for {drug_name}: {predicted_ratings[0]}')


Predicted ratings for Nitrofurantoin: 1


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews['review'] = reviews['review'].fillna('')  # Fill NaN with empty string
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews['review'] = reviews['review'].str.lower()  # Convert to lowercase
