# Part-1

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

# Load the pre-split training and test tables.
train_data = pd.read_csv('data/train_data.csv')
test_data = pd.read_csv('data/test_data.csv')

# Model inputs are the raw yearly citation counts; the target is the single
# class column created below.
# NOTE(review): cit_2021 and cit_2022 are both features here and are also the
# only two inputs of the label, so the label may be recoverable from the
# features alone -- confirm this is intended.
columns_to_use = [
    'cit_2017',
    'cit_2018',
    'cit_2019',
    'cit_2020',
    'cit_2021',
    'cit_2022',
]
columns_to_predict = ['citation_ratio']


def _ratio_class(ratio):
    """Encode a citation ratio as 1 (Low), 2 (Medium) or 3 (High).

    Below 1.05 is Low, 1.05 through 1.15 inclusive is Medium, and everything
    else is High. A NaN ratio fails both comparisons and therefore also ends
    up as 3.
    """
    if ratio < 1.05:
        return 1
    if ratio <= 1.15:
        return 2
    return 3


# Same labelling for both tables: the 2022/2021 citation ratio, rounded to two
# decimals, then bucketed into the three classes.
for frame in (train_data, test_data):
    rounded_ratio = (frame['cit_2022'] / frame['cit_2021']).round(2)
    frame['citation_ratio'] = rounded_ratio.apply(_ratio_class)

# Plain arrays for scikit-learn. The targets are selected with a one-element
# column list, so y_train / y_test are 2-D with a single column.
X_train, y_train = train_data[columns_to_use].values, train_data[columns_to_predict].values
X_test, y_test = test_data[columns_to_use].values, test_data[columns_to_predict].values

# Build a 100-tree random forest with a fixed seed and fit it in one step
# (fit returns the estimator itself). y_train is a single-column 2-D array,
# so it is flattened to the 1-D label vector that fit expects.
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    random_state=24,
).fit(X_train, y_train.ravel())

# Predicted class (1, 2 or 3) for every row of the test set.
y_pred = rf_classifier.predict(X_test)

# Translate numeric classes into readable names. Classes 1 and 2 have explicit
# names; any other value is reported as 'High', matching the 1/2/3 encoding
# used when the labels were created.
label_names = {1: 'Low', 2: 'Medium'}

# Predicted category for each test row.
categories = [label_names.get(label, 'High') for label in y_pred]

# True category for each test row. y_test was selected with a one-element
# column list and is therefore 2-D; flatten it so every item is a scalar
# label rather than a length-1 array.
real_categories = [label_names.get(label, 'High') for label in y_test.ravel()]

# calculate accuracy
# `accuracy` holds the number of test rows whose predicted category matches
# the true one; the fraction is computed when printing.
accuracy = sum(
    predicted == actual
    for predicted, actual in zip(categories, real_categories)
)

print(categories)
print(real_categories)

# An empty test set would otherwise raise ZeroDivisionError; report NaN instead.
print('Accuracy: ', accuracy / len(categories) if categories else float('nan'))

# Importance score of each input column, as reported by the fitted forest.
feature_importances = rf_classifier.feature_importances_

# Horizontal bar chart with one bar per feature column.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(columns_to_use, feature_importances)
ax.set_xlabel('Feature Importance')
ax.set_ylabel('Feature')
ax.set_title('Random Forest Feature Importance')
plt.show()

# Part-2

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

# import data
train_data = pd.read_csv('data/train_data.csv')
test_data = pd.read_csv('data/test_data.csv')


def _growth_rate(frame, year):
    """Return the row-wise relative citation change from ``year`` to ``year + 1``.

    The value is ``(cit_{year+1} - cit_{year}) / cit_{year}`` for each row.
    Rows where either of the two counts is 0 get 0 instead, which avoids the
    division by zero and keeps the "no usable ratio" convention of this
    feature. The decision is made per row: a zero count in one row must not
    wipe out the feature for every other row.
    """
    current = frame[f'cit_{year}']
    following = frame[f'cit_{year+1}']
    # Dividing by a zero count yields inf/NaN here rather than raising; those
    # rows are overwritten with 0 by the mask below.
    growth = (following - current) / current
    usable = (current != 0) & (following != 0)
    return growth.where(usable, 0)


# One growth feature per consecutive pair of years (2017->2018 ... 2021->2022),
# computed identically for the training and the test table.
for year in range(2017, 2022):
    for frame in (train_data, test_data):
        frame[f'cit_ratio_{year}'] = _growth_rate(frame, year)

# Model inputs are the five year-over-year growth features; the target is the
# single class column created below.
# NOTE(review): cit_ratio_2021 is derived from cit_2021 and cit_2022, the same
# two columns the label is computed from, so the label may be recoverable from
# that one feature -- confirm this is intended.
columns_to_use = [
    'cit_ratio_2017',
    'cit_ratio_2018',
    'cit_ratio_2019',
    'cit_ratio_2020',
    'cit_ratio_2021',
]
columns_to_predict = ['citation_ratio']


def _ratio_class(ratio):
    """Encode a citation ratio as 1 (Low), 2 (Medium) or 3 (High).

    Below 1.05 is Low, 1.05 through 1.15 inclusive is Medium, and everything
    else is High. A NaN ratio fails both comparisons and therefore also ends
    up as 3.
    """
    if ratio < 1.05:
        return 1
    if ratio <= 1.15:
        return 2
    return 3


# Same labelling for both tables: the 2022/2021 citation ratio, rounded to two
# decimals, then bucketed into the three classes.
for frame in (train_data, test_data):
    rounded_ratio = (frame['cit_2022'] / frame['cit_2021']).round(2)
    frame['citation_ratio'] = rounded_ratio.apply(_ratio_class)

# Plain arrays for scikit-learn. The targets are selected with a one-element
# column list, so y_train / y_test are 2-D with a single column.
X_train, y_train = train_data[columns_to_use].values, train_data[columns_to_predict].values
X_test, y_test = test_data[columns_to_use].values, test_data[columns_to_predict].values

# Build a 100-tree random forest with a fixed seed and fit it in one step
# (fit returns the estimator itself). y_train is a single-column 2-D array,
# so it is flattened to the 1-D label vector that fit expects.
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    random_state=24,
).fit(X_train, y_train.ravel())

# Predicted class (1, 2 or 3) for every row of the test set.
y_pred = rf_classifier.predict(X_test)

# Translate numeric classes into readable names. Classes 1 and 2 have explicit
# names; any other value is reported as 'High', matching the 1/2/3 encoding
# used when the labels were created.
label_names = {1: 'Low', 2: 'Medium'}

# Predicted category for each test row.
categories = [label_names.get(label, 'High') for label in y_pred]

# True category for each test row. y_test was selected with a one-element
# column list and is therefore 2-D; flatten it so every item is a scalar
# label rather than a length-1 array.
real_categories = [label_names.get(label, 'High') for label in y_test.ravel()]

# calculate accuracy
# `accuracy` holds the number of test rows whose predicted category matches
# the true one; the fraction is computed when printing.
accuracy = sum(
    predicted == actual
    for predicted, actual in zip(categories, real_categories)
)

# An empty test set would otherwise raise ZeroDivisionError; report NaN instead.
print('Accuracy: ', accuracy / len(categories) if categories else float('nan'))

# Importance score of each input column, as reported by the fitted forest.
feature_importances = rf_classifier.feature_importances_

# Horizontal bar chart with one bar per feature column.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(columns_to_use, feature_importances)
ax.set_xlabel('Feature Importance')
ax.set_ylabel('Feature')
ax.set_title('Random Forest Feature Importance')
plt.show()