# sophia_lda_model_v2 (use more restaurants)

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

import string

In [2]:
reviews = pd.read_csv('../data/filtered_reviews.csv')
reviews.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,l3Wk_mvAog6XANIuGQ9C7Q,ZbqSHbgCjzVAqaa7NKWn5A,EQ-TZ2eeD_E0BHuvoaeG5Q,4.0,0,0,0,"Locals recommended Milktooth, and it's an amaz...",2015-08-19 14:31:45
1,vBK79c3_1Ff_oqkh5VpfGg,Ohhrhu1RkqfVciIVx_W5HQ,nRKndeZLQ3eDL10UMwS2rQ,5.0,2,5,2,HOLY SMOKES!\n\nactual pumpkin pie mixed in wi...,2009-10-13 19:49:51
2,z0osLHDvXvzfm57D4DmD2Q,xVKE_HJ2pwUtTdLbL3pnCg,S2Ho8yLxhKAa26pBAm6rxA,3.0,0,0,0,"Service was crappy, and food was mediocre. I ...",2016-11-22 00:22:53
3,elqRpX9T3YwL07uLNtN3Bg,-sryo4gDYxbZ1T5Bz4l5Bw,ltBBYdNzkeKdCNPDAsxwAA,2.0,0,0,0,I at least have to give this restaurant two st...,2015-02-02 04:29:13
4,pHwbdway4yeI-dSSmZA7-Q,qEEk0PuoH1dVa619t8fgpw,PY9GRfzr4nTZeINf346QOw,4.0,0,0,0,We checked in around 2:30 pm. Check-in was qu...,2017-09-20 16:16:47


In [3]:
len(reviews['business_id'].unique())

315

In [6]:
business = pd.read_csv('../sandun_notebooks/csv/yelp_business.csv')
all_restaurants = business[business['categories'].str.contains('restaurant', case=False, na=False)]
all_restaurants.shape

(52286, 14)

In [7]:
# Merge all_restaurants with restaurant reviews on business_id
all_res_merged = pd.merge(reviews,
                         all_restaurants, on='business_id')
all_res_merged.shape

(538450, 22)

In [8]:
all_res_merged['state'].unique() # restaurants are from these 10 states

array(['IN', 'LA', 'TN', 'PA', 'CA', 'MO', 'NV', 'AZ', 'FL', 'ID'],
      dtype=object)

In [9]:
all_res_merged.head()

Unnamed: 0,review_id,user_id,business_id,stars_x,useful,funny,cool,text,date,name,...,state,postal_code,latitude,longitude,stars_y,review_count,is_open,attributes,categories,hours
0,l3Wk_mvAog6XANIuGQ9C7Q,ZbqSHbgCjzVAqaa7NKWn5A,EQ-TZ2eeD_E0BHuvoaeG5Q,4.0,0,0,0,"Locals recommended Milktooth, and it's an amaz...",2015-08-19 14:31:45,Milktooth,...,IN,46203,39.759169,-86.146494,4.0,1379,1,"{'GoodForKids': 'True', 'Alcohol': ""u'full_bar...","Beer, Wine & Spirits, Cafes, Coffee & Tea, Res...","{'Monday': '10:0-15:0', 'Friday': '10:0-15:0',..."
1,z0osLHDvXvzfm57D4DmD2Q,xVKE_HJ2pwUtTdLbL3pnCg,S2Ho8yLxhKAa26pBAm6rxA,3.0,0,0,0,"Service was crappy, and food was mediocre. I ...",2016-11-22 00:22:53,Creole House Restaurant & Oyster Bar,...,LA,70130,29.952213,-90.067116,4.0,1594,1,"{'WiFi': ""'free'"", 'RestaurantsAttire': ""u'cas...","Cajun/Creole, Seafood, Restaurants, Breakfast ...","{'Monday': '7:0-23:0', 'Tuesday': '7:0-23:0', ..."
2,elqRpX9T3YwL07uLNtN3Bg,-sryo4gDYxbZ1T5Bz4l5Bw,ltBBYdNzkeKdCNPDAsxwAA,2.0,0,0,0,I at least have to give this restaurant two st...,2015-02-02 04:29:13,Tavern,...,TN,37203,36.1509,-86.797012,4.0,1222,1,"{'BikeParking': 'True', 'Alcohol': ""u'full_bar...","Cocktail Bars, Nightlife, Gastropubs, Sports B...","{'Monday': '0:0-0:0', 'Wednesday': '10:0-22:0'..."
3,E9AB7V4z8xrt2uPF7T55FQ,iYY5Ii1LGpZCpXFkHlMefw,Zx7n8mdt8OzLRXVzolXNhQ,5.0,0,0,0,Amazing biscuits and (fill in the blank). Grea...,2018-04-27 23:03:21,Milk and Honey Nashville,...,TN,37203,36.154702,-86.784541,4.0,1725,1,"{'WheelchairAccessible': 'True', 'RestaurantsP...","American (New), Restaurants, American (Traditi...","{'Monday': '0:0-0:0', 'Thursday': '6:30-15:0',..."
4,jC-fGfx-YLqxVBcyTAd4Pw,EBa-0-6AKoy6jziNexDJtg,W4ZEKkva9HpAdZG88juwyQ,3.0,0,0,0,"In a word... ""OVERRATED!"". The food took fore...",2013-12-29 02:37:42,Mr. B's Bistro,...,LA,70130,29.954387,-90.068363,4.0,2064,1,"{'NoiseLevel': ""u'average'"", 'RestaurantsReser...","Bars, Breakfast & Brunch, Restaurants, Barbequ...","{'Wednesday': '11:30-20:0', 'Thursday': '11:30..."


### Review stopword list

In [11]:
# Load the English stopwords
stop_words = list(stopwords.words('english'))

# Words you want to keep and not treat as stopwords
words_to_keep = ["doesn't", "again", "aren't", "couldn't", "isn't", "more", "most", "no", "not", "of", "only", "too", "very", "don't", "off", "mustn't", "won't", "wouldn't", "didn't", "shouldn't", "wasn't", "weren't"]

# Remove the words you want to keep from the stopwords list
modified_stop_words = [word for word in stop_words if word not in words_to_keep]

def remove_punctuation(text):
    return "".join(char for char in text if not char in string.punctuation)

preprocessed_stopwords= [remove_punctuation(word) for word in modified_stop_words]

print(preprocessed_stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'youre', 'youve', 'youll', 'youd', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'shes', 'her', 'hers', 'herself', 'it', 'its', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'thatll', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'over', 'under', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'other', 'some', 'such', 'nor', 'own', 'same', 'so', 'than', 's', 't', 'can', 'will', 'just', 'don', 'should', 'shouldve', 'now'

In [12]:
# clean text (reviews) using the modified_stop_words
def clean(text):

    # Remove Punctuation
    for punctuation in string.punctuation:
        text = text.replace(punctuation, ' ')

    # Lower Case
    lowercased = text.lower()

    # Tokenize
    tokenized = word_tokenize(lowercased)

    # Remove numbers
    words_only = [word for word in tokenized if word.isalpha()]

    # Use modified_stop_words list instead of downloading a new one
    stop_words = set(preprocessed_stopwords)

    # Remove Stop Words using modified list
    without_stopwords = [word for word in words_only if not word in stop_words]

    return " ".join(without_stopwords)

In [13]:
# checking clean_text
# print(all_res_merged.iloc[0].clean_text)
# print(all_res_merged.iloc[0].text)

In [14]:
restaurants_reviews = all_res_merged.drop(columns=['user_id', 'useful', 'funny', 'cool',
                                                   'address', 'postal_code',
                                                   'is_open', 'attributes', 'categories', 'hours'])

In [15]:
restaurants_reviews.rename(columns={'stars_x': 'stars',
                            'stars_y': 'restaurant_avg_star'},
                            inplace=True)

In [16]:
restaurants_reviews.head(3)

Unnamed: 0,review_id,business_id,stars,text,date,name,city,state,latitude,longitude,restaurant_avg_star,review_count
0,l3Wk_mvAog6XANIuGQ9C7Q,EQ-TZ2eeD_E0BHuvoaeG5Q,4.0,"Locals recommended Milktooth, and it's an amaz...",2015-08-19 14:31:45,Milktooth,Indianapolis,IN,39.759169,-86.146494,4.0,1379
1,z0osLHDvXvzfm57D4DmD2Q,S2Ho8yLxhKAa26pBAm6rxA,3.0,"Service was crappy, and food was mediocre. I ...",2016-11-22 00:22:53,Creole House Restaurant & Oyster Bar,New Orleans,LA,29.952213,-90.067116,4.0,1594
2,elqRpX9T3YwL07uLNtN3Bg,ltBBYdNzkeKdCNPDAsxwAA,2.0,I at least have to give this restaurant two st...,2015-02-02 04:29:13,Tavern,Nashville,TN,36.1509,-86.797012,4.0,1222


In [17]:
# Check the balance of the dataset
print(restaurants_reviews['stars'].value_counts())

stars
5.0    272835
4.0    139669
3.0     60183
2.0     34353
1.0     31410
Name: count, dtype: int64


In [18]:
# Sample 5,000 reviews from each score category
balanced_reviews = restaurants_reviews.groupby('stars', group_keys=False).apply(lambda x: x.sample(n=5000, random_state=1)).reset_index(drop=True)

  balanced_reviews = restaurants_reviews.groupby('stars', group_keys=False).apply(lambda x: x.sample(n=5000, random_state=1)).reset_index(drop=True)


In [19]:
balanced_reviews['stars'].value_counts()

stars
1.0    5000
2.0    5000
3.0    5000
4.0    5000
5.0    5000
Name: count, dtype: int64

In [20]:
balanced_reviews.head()

Unnamed: 0,review_id,business_id,stars,text,date,name,city,state,latitude,longitude,restaurant_avg_star,review_count
0,EjsOmXE7yi1wKvnuSlbIwQ,9cu2p2Z925hHkWHVyrp1JQ,1.0,Awful. Awful experience . Will never go back. ...,2017-02-19 23:16:24,Hawkers Asian Street Food,St. Petersburg,FL,27.771325,-82.651594,4.0,1202
1,t3Pr-eGUeGkcFo2ECxjTFA,3SM8CZuY5MiAruczCm556w,1.0,Over-priced and poor management..\n\nComing fr...,2020-07-14 05:15:40,Barbacoa Grill,Boise,ID,43.597584,-116.183693,4.0,1099
2,rFUHyqIewSV3aJsGMxmksg,S2Ho8yLxhKAa26pBAm6rxA,1.0,"We just finished our meal, waited for about 10...",2018-12-05 22:54:29,Creole House Restaurant & Oyster Bar,New Orleans,LA,29.952213,-90.067116,4.0,1594
3,nQxBHhleBdF3u1IDggrQxA,ld_H5-FpZOWm_tkzwkPYQQ,1.0,The staff here is apparently not trained in an...,2020-12-22 21:00:29,Silver Legacy Resort Casino,Reno,NV,39.530502,-119.814995,3.0,1534
4,oV34j9gmonl2FHNehWElVg,AGlh4ZDv6jnoiYfz7At9mw,1.0,This fucking place. Don't even get me started....,2018-09-17 01:38:56,Dim Sum Garden,Philadelphia,PA,39.955399,-75.156727,4.0,2672


In [21]:
# Apply function to all `reviews`
# 3m 2.2s
balanced_reviews['clean_text'] = balanced_reviews['text'].apply(clean)
balanced_reviews.columns

Index(['review_id', 'business_id', 'stars', 'text', 'date', 'name', 'city',
       'state', 'latitude', 'longitude', 'restaurant_avg_star', 'review_count',
       'clean_text'],
      dtype='object')

In [22]:
balanced_reviews.drop(columns=['text'], inplace=True)

In [23]:
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer

min_dfs = {1: 0.02, 2: 0.005, 3: 0.001, 4: 0.0005}

sparse_matrices = []
feature_names = []

for i in range(1, 5):
    vectorizer = TfidfVectorizer(ngram_range=(i, i), min_df=min_dfs[i])
    vectorized_text = vectorizer.fit_transform(balanced_reviews['clean_text'])
    sparse_matrices.append(vectorized_text)
    feature_names.extend([f"{word}" for word in vectorizer.get_feature_names_out()])

    print(f"Number of features for {i} words: {vectorized_text.shape[1]}")

# Horizontally concatenate all sparse matrices
concatenated_vectorized_text = hstack(sparse_matrices)

# Convert the concatenated sparse matrix to a DataFrame
vectorized_text_df = pd.DataFrame(
    concatenated_vectorized_text.toarray(),
    columns=feature_names
)

vectorized_text_df


Number of features for 1 words: 514
Number of features for 2 words: 448
Number of features for 3 words: 417
Number of features for 4 words: 87


Unnamed: 0,able,absolutely,across,actually,again,ago,almost,already,also,although,...,would definitely go again,would definitely go back,would definitely recommend place,would go back again,would never go back,would not come back,would not go back,would not order again,would not recommend place,would not recommend restaurant
0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.168220,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.416830,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24996,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24997,0.0,0.108733,0.0,0.0,0.0,0.122696,0.0,0.0,0.070144,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24998,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.246913,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
# 25 topics: Took 6 min 29 sec
# 30 topics: Took 7 min 12 sec
# Count Vectorizer: 14min 11.1s
from sklearn.decomposition import LatentDirichletAllocation

# Instantiate the LDA
n_components = 30
lda_model = LatentDirichletAllocation(
    n_components=n_components,
    max_iter=500,
    random_state=1,
    n_jobs=-1,
)

# Fit the LDA on the vectorized documents
lda_model.fit(concatenated_vectorized_text)

In [None]:
document_mixture = lda_model.transform(concatenated_vectorized_text)
document_mixture.shape

In [None]:
def print_topics(model, feature_names):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(feature_names[i], topic[i]) for i in topic.argsort()[:-10 - 1:-1]])

def get_topics(model, feature_names):
    topics = []
    for idx, topic in enumerate(model.components_):
        topics.append([(feature_names[i], topic[i]) for i in topic.argsort()[:-10 - 1:-1]])
    return topics

# Use the modified functions
print_topics(lda_model, feature_names)

topics = get_topics(lda_model, feature_names)
topics


In [None]:
def export_topics_to_csv(topics, params):
    topics_df = pd.DataFrame(topics)
    params_string = '_'.join([f'{value}{key}' for key, value in params.items()])
    topics_df.to_csv(f'../sandun_notebooks/csv/topics_{params_string}.csv', index=False)

In [None]:
min_dfs


In [None]:
import csv

param_dict = {
    'topics': n_components,
    'min_dfs': "0.02_0.005_0.001_0.0005",
    'max_iter': '500',
    'topic_word_prior': 0.01
}

# CSV file name
params_string = '_'.join([f'{value}{key}' for key, value in param_dict.items()])
filename = f'./csv/topics_{params_string}.csv'

# Creating the CSV file
with open(filename, 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)

    # Writing the headers
    # csvwriter.writerow(["Topic", "Phrase", "Score"])
    csvwriter.writerow(["phrase", "score", "topic"])

    # Writing the data
    for topic_index, topic_list in enumerate(topics):
        for phrase, score in topic_list:
            csvwriter.writerow([phrase, score, topic_index])

print(f"CSV file '{filename}' created successfully.")

In [None]:
# n_components = 30
# lda_model = LatentDirichletAllocation(
#     n_components=n_components,
#     max_iter=500,
#     random_state=1,
#     n_jobs=-1,
#     topic_word_prior=0.01,
# )


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from sklearn.preprocessing import StandardScaler

In [None]:
key_phrases = pd.read_csv('./csv/topics_30topics_0.02_0.005_0.001_0.0005min_dfs_500max_iter_0.01topic_word_prior.csv')

In [None]:
key_phrases

In [None]:
topics

In [None]:
len(topics)

In [None]:
from openai import OpenAI
import os
os.environ.setdefault('OPENAI_API_KEY', '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')
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
OpenAI.api_key = OPENAI_API_KEY
client = OpenAI()

# Prompt for the AI model
prompt = f"""I am providing a list with 30 internal lists.
Each internal list contains phrases pertaining to a particular topic.
Each phrase has a score pertaining to how likely it is to be associated with the topic.
Please provide a name for each of the 30 topics, in 3 words or less, based on the phrases and scores provided.
(Try to keep within 2 words, just using words from the phrases if possible)

{topics}

"""

# print(prompt)
# Make a request to the API to generate text

response = client.chat.completions.create(
  model="gpt-4-0125-preview",
  messages = [
        {"role": "system",
         "content": """Give the response as a python list assigned to variable
         'topic_labels', and give each string quoted in double quotations,
         not single quotes. Ensure the list contains 30 items"""},
        {"role": "user", "content": prompt}
    ],
  max_tokens = 1000,
)

print(response.choices[0].message.content)

In [None]:
topic_labels = ["Customer Service Experience", "Bar Atmosphere", "Mediocre Food", "Lacking Value", "New Orleans Culture", "Long Waits", "Food Quality", "Southern Dishes", "Shrimp Po' Boy", "Returning Doubtful", "Taste Preferences", "Southern Comfort Food", "Happy Hour Selection", "Cheese and Steak", "Philadelphia Cheesesteak", "Reliable Staff", "Quality Dining", "Outstanding Service", "Satisfactory Experience", "Dissatisfied Feedback", "Return Intent", "Southern Classics", "Outdoor Seating", "Service Issues", "Extraordinary Recommendation", "Oyster Specialties", "Breakfast Choices", "Favorite Spot", "Top Picks", "Tourist Avoidance"]

In [None]:
# Convert topic labels to csv of a single column
topic_labels_df = pd.DataFrame(topic_labels, columns=['topic_label'])
topic_labels_df.to_csv('./csv/topic_labels.csv', index=False)


In [None]:
len(topic_labels)

In [None]:
mapping_dict = {i: label for i, label in enumerate(topic_labels)}

# Map 'topic' column to 'topic_label' using the labels list
# We use the 'topic' values directly as indexes for the 'topic_labels' list
key_phrases['topic_label'] = key_phrases['topic'].map(lambda x: topic_labels[x])
key_phrases

In [None]:
key_phrases.to_csv('./csv/topic_allocation_with_labels.csv', index=False)

In [None]:
n_topics = key_phrases['topic'].nunique()
n_topics

In [None]:
df = pd.read_csv('./csv/df_review_top10.csv')
df.stars.mean(), df.stars.min(), df.stars.max(), df.shape

In [None]:
df

In [None]:
# Add 20 new columns with names "topics1" to "topics20"
for i in range(1, n_topics + 1):
    col_name = topic_labels[i - 1]
    df[col_name] = None
df.columns

In [None]:
df

In [None]:
# Initialize the new columns with zeros
for i in range(1, n_topics + 1):
    col_name = topic_labels[i-1]
    df[col_name] = 0.0  # Initialize with 0.0 (or any other default value)

# Iterate through each row of the dataframe
for index, row in df.iterrows():
    clean_text = row['clean_text']

    # Iterate through each phrase
    for phrase, score, topic in zip(key_phrases['phrase'], key_phrases['score'], key_phrases['topic']):
        # Check if the phrase appears in the clean_text
        if phrase in clean_text:
            # If it does, sum the score to the corresponding topic
            df.at[index, topic_labels[topic]] +=  score

In [None]:
df

In [None]:
# Define features and target variable
X = df.drop(['review_id', 'business_id', 'name', 'text', 'restaurant_avg_star', 'review_count', 'avg_stars10m_radius', 'date', 'clean_text', 'stars'], axis=1)  # Features

y = df['stars']  # Target variable

X.columns

In [None]:
# Initialize the scaler
scaler = StandardScaler()

# Fit and transform your feature matrix (X)
X_standardized = scaler.fit_transform(X)
X_standardized

In [None]:
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_standardized, y, test_size=0.2, random_state=42)

# Initialize and train the regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate R-squared
r_squared = r2_score(y_test, y_pred)
print("R-squared:", r_squared)

# Calculate mean absolute error
mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error:", mae)

# Calculate error
mape = mean_absolute_percentage_error(y_test, y_pred)
print("Mean absolute percentage error:", mape)

In [None]:
X_train.sum(axis=0)

In [None]:
coefficients = model.coef_
feature_names = X.columns
df_coefficients = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})

# If you want to include the intercept as well
intercept_row = pd.DataFrame({'Feature': ['Intercept'], 'Coefficient': [model.intercept_]})
df_coefficients = pd.concat([intercept_row, df_coefficients], ignore_index=True)
df_coefficients = df_coefficients.sort_values("Coefficient", ascending=False)
df_coefficients.to_csv('Top_Coefficients.csv', index=False)

In [None]:
coefficients_dict = df_coefficients.set_index('Feature')['Coefficient'].to_dict()

In [None]:
coefficients_dict.pop('Intercept')

In [None]:
coefficients_dict

In [None]:
# Get top 5
top_5 = dict(sorted(coefficients_dict.items(), key=lambda item: item[1], reverse=True)[:5])

# Get bottom 5
bottom_5 = dict(sorted(coefficients_dict.items(), key=lambda item: item[1])[:5])


top_5

In [None]:
bottom_5