In [None]:
# Data Processing
import pandas as pd
import numpy as np

# Visualisation
import matplotlib.pyplot as plt

# Load the training data
train_data = pd.read_csv('drugsComTrain_raw.tsv', sep='\t')

# Load the test data
test_data = pd.read_csv('drugsComTest_raw.tsv', sep='\t')

# Display the first few rows of each dataset
print("Training Data:")
print(train_data.head())

print("\nTest Data:")
print(test_data.head())

# Optional: Display basic info about the datasets
print("\nTraining Data Info:")
print(train_data.info())

print("\nTest Data Info:")
print(test_data.info())

Training Data:
   Unnamed: 0                  drugName                     condition  \
0      206461                 Valsartan  Left Ventricular Dysfunction   
1       95260                Guanfacine                          ADHD   
2       92703                    Lybrel                 Birth Control   
3      138000                Ortho Evra                 Birth Control   
4       35696  Buprenorphine / naloxone             Opiate Dependence   

                                              review  rating  \
0  "It has no side effect, I take it in combinati...     9.0   
1  "My son is halfway through his fourth week of ...     8.0   
2  "I used to take another oral contraceptive, wh...     5.0   
3  "This is my first time using any form of birth...     8.0   
4  "Suboxone has completely turned my life around...     9.0   

                date  usefulCount  
0       May 20, 2012           27  
1     April 27, 2010          192  
2  December 14, 2009           17  
3   November 3, 2

#### **Training Data**

- **Total Entries**: 161,297 rows
    
- **Columns**:
    
    1. **`Unnamed: 0`**: An index-like column that likely corresponds to the original dataset's row numbers.
    2. **`drugName`**: The name of the drug being reviewed (e.g., "Valsartan").
    3. **`condition`**: The medical condition for which the drug was prescribed (e.g., "ADHD"). Note that some values are missing in this column.
    4. **`review`**: A text review from the patient about the drug.
    5. **`rating`**: A numerical score (out of 10) reflecting the patient’s satisfaction with the drug.
    6. **`date`**: The date the review was written.
    7. **`usefulCount`**: The number of users who found the review helpful.
- **Key Observations**:
    
    - The dataset has missing values in the `condition` column.
    - `review` is textual data, suitable for sentiment analysis or text feature extraction.
    - `rating` is the target variable for regression or classification tasks.

#### **Test Data**

- **Total Entries**: 53,766 rows
- **Columns**: The same as the training data.
- **Key Differences**:
    - Smaller size compared to the training data.
    - The `condition` column also has missing values here.

In [17]:
# Convert training data
train_data.to_csv('drugsComTrain_raw.csv', index=False)

# Convert test data
test_data.to_csv('drugsComTest_raw.csv', index=False)

print("Files successfully converted to CSV!")

Files successfully converted to CSV!


In [None]:
train_data = train_data.dropna()
test_data = test_data.dropna()

features = ['drugName', 'condition', 'usefulCount']
target = 'rating'

train_data = train_data.drop(columns=['review', 'date'])
test_data = test_data.drop(columns=['review', 'date'])

# Convert categorical variables 'drugName' and 'condition' into one-hot encoded columns.
# drop_first=True avoids the dummy variable trap by removing the first category.
train_data_cleaned = pd.get_dummies(train_data, columns=['drugName', 'condition'], drop_first=True)

### **Code Explanation**

#### **Step 1: Drop Rows with Missing Values**

The lines `train_data = train_data.dropna()` and `test_data = test_data.dropna()` remove any rows in the training and test datasets that contain missing values (`NaN`). This is necessary because missing values can disrupt the training process, as machine learning models typically require complete data for all features.

#### **Step 2: Define Features and Target**

The `features` variable specifies the columns to use as input features (`drugName`, `condition`, `usefulCount`), and the `target` variable defines the column to predict (`rating`). This separation ensures the model knows which data to use for predictions and what to predict.

#### **Step 3: Drop Irrelevant Columns**

The lines `train_data = train_data.drop(columns=['review', 'date'])` and `test_data = test_data.drop(columns=['review', 'date'])` remove the `review` and `date` columns. These columns are dropped because they are either uninformative (`date`) or incompatible with the model (`review`, as it contains text).

#### **Step 4: One-Hot Encode Categorical Variables**

The line `train_data_cleaned = pd.get_dummies(train_data, columns=['drugName', 'condition'], drop_first=True)` converts the categorical variables `drugName` and `condition` into one-hot encoded numerical columns. The parameter `drop_first=True` removes the first category to prevent redundancy (known as the dummy variable trap). This step is essential because machine learning models like Random Forest require numerical input and cannot process categorical text directly.

---

### **Purpose of the Code**

This code prepares the data for training by:

1. Cleaning missing values to ensure complete data.
2. Defining the input features and target variable.
3. Removing irrelevant or problematic columns (`review`, `date`).
4. Converting categorical data (`drugName` and `condition`) into a format that the model can understand (one-hot encoding).

In [23]:
# Select features and target for training data
X_train = train_data_cleaned.drop('rating', axis=1)
y_train = train_data_cleaned['rating']

# Apply the same preprocessing to test data (dropping NaNs and encoding)
test_data_cleaned = pd.get_dummies(test_data, columns=['drugName', 'condition'], drop_first=True)

# Ensure test data has the same columns as training data
X_test = test_data_cleaned[X_train.columns.intersection(test_data_cleaned.columns)]
y_test = test_data_cleaned['rating']

# Print shapes to verify alignment
print(f"Training Features: {X_train.shape}")
print(f"Test Features: {X_test.shape}")
print(f"Training Target: {y_train.shape}")
print(f"Test Target: {y_test.shape}")

Training Features: (160398, 4315)
Test Features: (53471, 3075)
Training Target: (160398,)
Test Target: (53471,)


### **Code Explanation**

#### **Step 1: Separate Features and Target for Training Data**

The line `X_train = train_data_cleaned.drop('rating', axis=1)` selects all columns except the target column (`rating`) as the features for training. The target column (`rating`) is assigned to `y_train`.

- **Outcome**:
    - `X_train` contains the input data (independent variables) for training, with 160,398 rows (samples) and 4,315 columns (features).
    - `y_train` contains the corresponding target values (dependent variable), with 160,398 rows (samples).

---

#### **Step 2: Preprocess the Test Data**

The line `test_data_cleaned = pd.get_dummies(test_data, columns=['drugName', 'condition'], drop_first=True)` applies the same one-hot encoding to the test data as was applied to the training data. This step ensures the categorical variables `drugName` and `condition` are transformed into numerical columns for the test data.

---

#### **Step 3: Align Test Data Columns with Training Data**

The line `X_test = test_data_cleaned[X_train.columns.intersection(test_data_cleaned.columns)]` ensures that the test data (`X_test`) has the same set of columns (features) as the training data (`X_train`). Any columns in the training data that are missing in the test data will not be included, and any extra columns in the test data will be excluded.

The target column (`rating`) is then extracted from the test data and assigned to `y_test`.

- **Outcome**:
    - `X_test` contains 53,471 rows (samples) and 3,075 columns (features). The reduced feature count (compared to training) is due to missing categories in the test data.
    - `y_test` contains 53,471 rows (samples) of the corresponding target values.

---

#### **Step 4: Print Shapes to Verify Alignment**

The shapes of the training and test data are printed to ensure alignment:

- `X_train.shape`: (160,398 rows, 4,315 columns) indicates the training features.
- `y_train.shape`: (160,398 rows) indicates the training target.
- `X_test.shape`: (53,471 rows, 3,075 columns) indicates the test features.
- `y_test.shape`: (53,471 rows) indicates the test target.

**Discrepancy**: The number of features in `X_train` (4,315) and `X_test` (3,075) differs, which might cause issues during model prediction unless handled properly (e.g., filling missing columns in the test data with zeros).

In [24]:
from sklearn.ensemble import RandomForestClassifier

# Initialize the Random Forest Classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf.fit(X_train, y_train)

print("Random Forest model trained successfully!")

Random Forest model trained successfully!


## Code Explanation
#### **Step 1: Initialize the Random Forest Model**

The line `rf = RandomForestClassifier(n_estimators=100, random_state=42)` creates an instance of the Random Forest Classifier with the following parameters:

- **`n_estimators=100`**: Specifies the number of decision trees in the forest. More trees generally improve accuracy but increase training time.
- **`random_state=42`**: Sets a random seed to ensure reproducibility of results.

---

#### **Step 2: Train the Model**

The line `rf.fit(X_train, y_train)` trains the Random Forest model using the **training features (`X_train`)** and **training target (`y_train`)**:

- During training, the model creates 100 decision trees, each trained on a random subset of the training data.
- Predictions from all trees are combined (e.g., majority voting for classification) to improve overall accuracy and reduce overfitting.

In [26]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

X_test = test_data_cleaned.reindex(columns=X_train.columns, fill_value=0)

# Make predictions on the test set
y_pred = rf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.28

Classification Report:
              precision    recall  f1-score   support

         1.0       0.30      0.31      0.30      7265
         2.0       0.07      0.06      0.07      2324
         3.0       0.07      0.05      0.06      2197
         4.0       0.05      0.04      0.05      1642
         5.0       0.08      0.06      0.07      2691
         6.0       0.07      0.06      0.06      2102
         7.0       0.08      0.07      0.07      3075
         8.0       0.14      0.12      0.13      6118
         9.0       0.21      0.20      0.21      9120
        10.0       0.45      0.54      0.49     16937

    accuracy                           0.28     53471
   macro avg       0.15      0.15      0.15     53471
weighted avg       0.25      0.28      0.26     53471


Confusion Matrix:
[[2240  442  331  242  374  269  363  669  738 1597]
 [ 532  140  163   89  152   84  146  239  279  500]
 [ 453  130  117   97  143   92  143  234  295  493]
 [ 317   86  102   70   

## Code Explanation
#### **Step 1: Align Test Data Columns**

The line `X_test = test_data_cleaned.reindex(columns=X_train.columns, fill_value=0)` ensures that the test data (`X_test`) has the same columns as the training data (`X_train`):

- **Why it’s needed**: After one-hot encoding, training and test datasets might have different columns due to missing categories in one dataset. This alignment fills missing columns in `X_test` with zeros, ensuring compatibility with the trained model.

---

#### **Step 2: Make Predictions**

The line `y_pred = rf.predict(X_test)` uses the trained Random Forest model (`rf`) to predict the target values (`rating`) for the test data (`X_test`):

- The model generates predictions based on the features in `X_test` and the patterns it learned from the training data.

---

#### **Step 3: Evaluate the Model**

- **`accuracy_score(y_test, y_pred)`**: Calculates the accuracy of the model (correct predictions divided by total predictions).
    
    - **Outcome**: The model achieves an accuracy of **28%**, which is quite low.
- **`classification_report(y_test, y_pred)`**: Provides precision, recall, and F1-score for each class (ratings 1.0 to 10.0):
    
    - **Precision**: The proportion of correct predictions for a class among all predictions for that class.
        
    - **Recall**: The proportion of correct predictions for a class among all true instances of that class.
        
    - **F1-Score**: The harmonic mean of precision and recall.
        
    - **Outcome**:
        
        - The model performs better for the majority class (rating 10.0), with higher precision and recall.
        - Performance for minority classes (e.g., 2.0, 3.0, 4.0) is very poor, with low precision, recall, and F1-scores.
- **`confusion_matrix(y_test, y_pred)`**: Displays a matrix where rows represent actual classes and columns represent predicted classes:
    
    - Diagonal values represent correct predictions.
        
    - Off-diagonal values indicate misclassifications.
        
    - **Outcome**:
        
        - Most predictions are concentrated around the majority class (rating 10.0).
        - Significant misclassification occurs for other ratings, reflecting the class imbalance and difficulty in predicting minority classes.

---

### **Why Accuracy is Low**

1. **Class Imbalance**: The dataset has a skewed distribution of ratings, with a high proportion of 10.0 ratings. This biases the model towards the majority class.
2. **High Dimensionality**: The model has to process 4,315 features, many of which might be irrelevant or noisy.
3. **Complex Multi-Class Problem**: Predicting 10 distinct ratings is inherently challenging, especially when the data distribution is uneven.
4. **Limited Feature Usefulness**: Features like `drugName` and `condition` may not have strong predictive power for ratings without additional context.

In [28]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Split the data into features (X) and target (y)
X = train_data_cleaned.drop('rating', axis=1)
y = train_data_cleaned['rating']

# Train a temporary Random Forest model to get feature importances
rf_temp = RandomForestClassifier(n_estimators=10, random_state=42)
rf_temp.fit(X, y)

# Select top 500 important features
import numpy as np
feature_importances = rf_temp.feature_importances_
top_features = np.argsort(feature_importances)[-500:]  # Adjust the number as needed

X_reduced = X.iloc[:, top_features]  # Reduce dataset to top features

# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Check the new class distribution
from collections import Counter
print(f"Class distribution before SMOTE: {Counter(y)}")
print(f"Class distribution after SMOTE: {Counter(y_resampled)}")

MemoryError: Unable to allocate 5.16 GiB for an array with shape (4315, 160398) and data type int64

In [None]:
# Split resampled data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

print(f"Training Features Shape: {X_train.shape}")
print(f"Training Target Shape: {y_train.shape}")
print(f"Test Features Shape: {X_test.shape}")
print(f"Test Target Shape: {y_test.shape}")

In [None]:
# Initialize and train the model
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Make predictions
y_pred = rf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))