<a href="https://colab.research.google.com/github/RyanBrumbaugh/Projects/blob/main/Improvement_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def sw_removal(comment, stop_words):
    """Return ``comment`` with every stop word removed.

    The text is split with NLTK's ``word_tokenize``; a token is discarded
    when its lower-cased form is contained in ``stop_words``. The surviving
    tokens keep their original casing and are re-joined with single spaces.

    Args:
        comment: The text to clean.
        stop_words: Container of lower-case words to strip out.

    Returns:
        The cleaned text as a single space-separated string.
    """
    kept_tokens = []
    for token in word_tokenize(comment):
        if token.lower() in stop_words:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

def process_data(data, column_name, stop_words):
    """Filter the report, clean one free-text column and TF-IDF vectorize it.

    Rows are kept when 'Last Updated' falls in 2023 and the numeric
    'Compensation Grade' is at least 60 ('J...' grade codes are first mapped
    to numeric equivalents). The metadata columns are then dropped, rows with
    any remaining missing value are removed, stop words are stripped from
    ``column_name`` and the cleaned text is vectorized with word 1- to
    3-grams.

    Args:
        data: DataFrame holding 'Last Updated', 'Compensation Grade', the
            metadata columns dropped below and ``column_name``. Its
            'Last Updated' column is converted to datetime in place.
        column_name: Name of the free-text column to clean and vectorize. It
            is always kept, even if it appears in the drop list.
        stop_words: Iterable of lower-case words to strip from the text. It
            is not modified.

    Returns:
        A tuple ``(processed_data, X, vectorizer)``: the filtered frame with
        the cleaned text column, the TF-IDF matrix built from that column
        (one row per row of ``processed_data``), and the fitted
        ``TfidfVectorizer``.

    Raises:
        KeyError: If a required column is missing from ``data``.
    """
    # Parsed in place on the caller's frame, so later uses of `data` see
    # datetimes; unparseable values become NaT and fail the year filter.
    data['Last Updated'] = pd.to_datetime(data['Last Updated'], errors='coerce')
    # .copy() yields an independent frame so the column assignment below does
    # not trigger pandas' SettingWithCopyWarning.
    data = data[data['Last Updated'].dt.year == 2023].copy()

    grade_map = {'J060': '35', 'J070': '45', 'J080': '45', 'J090': '45',
                 'J100': '55', 'J110': '60', 'J120': '65', 'J130': '70',
                 'J140': '70'}
    # Grades that are still non-numeric after the mapping become NaN and are
    # excluded by the >= 60 comparison.
    data['Compensation Grade'] = pd.to_numeric(
        data['Compensation Grade'].replace(grade_map), errors='coerce')
    data = data[data['Compensation Grade'] >= 60]

    columns_to_drop = ['Retention', 'Strengths', 'Potential Assessment Notes',
                       'Current Year -1 Review Rating', 'Current Year -2 Review Rating', 'Hire Date', 'Company',
                       'Location', 'Location Address - Country', 'Job Title', 'Compensation Grade', 'Segment',
                       'Cost Center', 'Functional Area Code', 'Functional Area Name', 'Functional Department',
                       'Reporting Unit', 'Current Year -1 Completed Review', 'Employee ID', 'Manager - Level 01',
                       'Manager Level 01', 'Manager Level 02', 'Manager Level 03', 'Manager Level 04', 'Manager Level 05',
                       'Manager Level 06', 'Manager Level 07', 'Manager Level 08', 'Manager Level 09',
                       'Supervisory Organization', 'Last Updated', 'Current Year -2 Completed Review']
    # The target column must survive the drop: 'Strengths' is in the list
    # above, so dropping it unconditionally makes the lookup below raise
    # KeyError whenever column_name == 'Strengths'.
    columns_to_drop = [name for name in columns_to_drop if name != column_name]
    processed_data = data.drop(columns=columns_to_drop).dropna()

    # Build a private set so the caller's stop-word collection is untouched.
    all_stop_words = set(stop_words) | {'viatris', 'company'}
    processed_data[column_name] = processed_data[column_name].apply(
        lambda text: sw_removal(text, all_stop_words))

    vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 3))
    X = vectorizer.fit_transform(processed_data[column_name].tolist())

    return processed_data, X, vectorizer

def predict_categories(X, category_descriptions, fitted_vectorizer=None):
    """Label each row of ``X`` with its most similar category.

    The category descriptions are projected into the TF-IDF space of the
    given vectorizer and compared to every row of ``X`` by cosine similarity;
    the category with the highest score wins.

    Args:
        X: TF-IDF matrix of the documents to classify.
        category_descriptions: Category description texts to compare against.
        fitted_vectorizer: The fitted vectorizer that produced ``X``. It must
            be the same one, otherwise the two matrices do not share a
            feature space. When omitted, the module-level ``vectorizer`` is
            used.

    Returns:
        A list with one category name per row of ``X``, taken from the keys
        of the module-level ``categories`` mapping.
    """
    if fitted_vectorizer is None:
        fitted_vectorizer = vectorizer
    # NOTE(review): assumes the keys of `categories` are in the same order as
    # `category_descriptions` -- confirm where both are defined.
    category_names = list(categories.keys())
    Y = fitted_vectorizer.transform(category_descriptions)
    similarity_scores = cosine_similarity(X, Y)
    return [category_names[best] for best in similarity_scores.argmax(axis=1)]

# NOTE(review): `stop_words`, `categories` and `category_descriptions` are not
# defined in the visible part of this file -- confirm they are set up earlier.

# Read data
input_file_path = 'Ryans Report.csv'
data = pd.read_csv(input_file_path)

# Process strengths data
strengths_data, X_strengths, vectorizer = process_data(data, 'Strengths', stop_words)

# Process areas of opportunity data (keep its own vectorizer: each call fits
# a separate vocabulary, so the matrices are not interchangeable)
areas_data, X_areas, areas_vectorizer = process_data(data, 'Areas Of Opportunity', stop_words)

# Predict categories for strengths and areas of opportunity, pairing each
# matrix with the vectorizer that produced it
strengths_predicted_categories = predict_categories(X_strengths, category_descriptions, vectorizer)
areas_predicted_categories = predict_categories(X_areas, category_descriptions, areas_vectorizer)

# Add predicted categories to the dataframes
strengths_data['Strengths Predicted Category'] = strengths_predicted_categories
areas_data['Areas Of Opportunity Predicted Category'] = areas_predicted_categories

# Merge dataframes
# NOTE(review): the left join keeps only workers present in strengths_data, so
# a worker with only an areas prediction loses it here -- confirm intended.
merged_data = strengths_data.merge(areas_data[['Worker', 'Areas Of Opportunity Predicted Category']], on='Worker', how='left')
final_merged_data = data.merge(merged_data[['Worker', 'Strengths Predicted Category', 'Areas Of Opportunity Predicted Category']], on='Worker', how='left')

# Export to Excel
# Placeholder path; pandas picks the Excel writer from the file extension, so
# the real path needs one (e.g. .xlsx).
output_file_path = r'\\file_path'
# `encoding` is not passed: the keyword was unused, deprecated in pandas 1.5
# and removed in pandas 2.0, where it raises TypeError.
final_merged_data.to_excel(output_file_path, index=False)

print("DataFrame exported to Excel with UTF-8 encoding successfully.")
