In [None]:
# Data Processing
import pandas as pd
import numpy as np

# Visualisation
import matplotlib.pyplot as plt

# Load the training data
train_data = pd.read_csv('drugsComTrain_raw.tsv', sep='\t')

# Load the test data
test_data = pd.read_csv('drugsComTest_raw.tsv', sep='\t')

# Display the first few rows of each dataset
print("Training Data:")
print(train_data.head())

print("\nTest Data:")
print(test_data.head())

# Optional: Display basic info about the datasets
print("\nTraining Data Info:")
print(train_data.info())

print("\nTest Data Info:")
print(test_data.info())

Training Data:
   Unnamed: 0                  drugName                     condition  \
0      206461                 Valsartan  Left Ventricular Dysfunction   
1       95260                Guanfacine                          ADHD   
2       92703                    Lybrel                 Birth Control   
3      138000                Ortho Evra                 Birth Control   
4       35696  Buprenorphine / naloxone             Opiate Dependence   

                                              review  rating  \
0  "It has no side effect, I take it in combinati...     9.0   
1  "My son is halfway through his fourth week of ...     8.0   
2  "I used to take another oral contraceptive, wh...     5.0   
3  "This is my first time using any form of birth...     8.0   
4  "Suboxone has completely turned my life around...     9.0   

                date  usefulCount  
0       May 20, 2012           27  
1     April 27, 2010          192  
2  December 14, 2009           17  
3   November 3, 2

#### **Training Data**

- **Total Entries**: 161,297 rows
    
- **Columns**:
    
    1. **`Unnamed: 0`**: An index-like column that likely corresponds to the original dataset's row numbers.
    2. **`drugName`**: The name of the drug being reviewed (e.g., "Valsartan").
    3. **`condition`**: The medical condition for which the drug was prescribed (e.g., "ADHD"). Note that some values are missing in this column.
    4. **`review`**: A text review from the patient about the drug.
    5. **`rating`**: A numerical score (out of 10) reflecting the patient’s satisfaction with the drug.
    6. **`date`**: The date the review was written.
    7. **`usefulCount`**: The number of users who found the review helpful.
- **Key Observations**:
    
    - The dataset has missing values in the `condition` column.
    - `review` is textual data, suitable for sentiment analysis or text feature extraction.
    - `rating` is the target variable for regression or classification tasks.

#### **Test Data**

- **Total Entries**: 53,766 rows
- **Columns**: The same as the training data.
- **Key Differences**:
    - Smaller size compared to the training data.
    - The `condition` column also has missing values here.

In [17]:
# Convert training data
train_data.to_csv('drugsComTrain_raw.csv', index=False)

# Convert test data
test_data.to_csv('drugsComTest_raw.csv', index=False)

print("Files successfully converted to CSV!")

Files successfully converted to CSV!


In [None]:
train_data = train_data.dropna()
test_data = test_data.dropna()

features = ['drugName', 'condition', 'usefulCount']
target = 'rating'

train_data = train_data.drop(columns=['review', 'date'])
test_data = test_data.drop(columns=['review', 'date'])

# Convert categorical variables 'drugName' and 'condition' into one-hot encoded columns.
# drop_first=True avoids the dummy variable trap by removing the first category.
train_data_cleaned = pd.get_dummies(train_data, columns=['drugName', 'condition'], drop_first=True)

In [23]:
# Select features and target for training data
X_train = train_data_cleaned.drop('rating', axis=1)
y_train = train_data_cleaned['rating']

# Apply the same preprocessing to test data (dropping NaNs and encoding)
test_data_cleaned = pd.get_dummies(test_data, columns=['drugName', 'condition'], drop_first=True)

# Ensure test data has the same columns as training data
X_test = test_data_cleaned[X_train.columns.intersection(test_data_cleaned.columns)]
y_test = test_data_cleaned['rating']

# Print shapes to verify alignment
print(f"Training Features: {X_train.shape}")
print(f"Test Features: {X_test.shape}")
print(f"Training Target: {y_train.shape}")
print(f"Test Target: {y_test.shape}")

Training Features: (160398, 4315)
Test Features: (53471, 3075)
Training Target: (160398,)
Test Target: (53471,)


In [24]:
from sklearn.ensemble import RandomForestClassifier

# Initialize the Random Forest Classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf.fit(X_train, y_train)

print("Random Forest model trained successfully!")

Random Forest model trained successfully!


In [26]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

X_test = test_data_cleaned.reindex(columns=X_train.columns, fill_value=0)

# Make predictions on the test set
y_pred = rf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.28

Classification Report:
              precision    recall  f1-score   support

         1.0       0.30      0.31      0.30      7265
         2.0       0.07      0.06      0.07      2324
         3.0       0.07      0.05      0.06      2197
         4.0       0.05      0.04      0.05      1642
         5.0       0.08      0.06      0.07      2691
         6.0       0.07      0.06      0.06      2102
         7.0       0.08      0.07      0.07      3075
         8.0       0.14      0.12      0.13      6118
         9.0       0.21      0.20      0.21      9120
        10.0       0.45      0.54      0.49     16937

    accuracy                           0.28     53471
   macro avg       0.15      0.15      0.15     53471
weighted avg       0.25      0.28      0.26     53471


Confusion Matrix:
[[2240  442  331  242  374  269  363  669  738 1597]
 [ 532  140  163   89  152   84  146  239  279  500]
 [ 453  130  117   97  143   92  143  234  295  493]
 [ 317   86  102   70   