In [1]:
# Let's import json to create the responses un JSON format.
import json

In [2]:
import pandas as pd
import numpy as np

df_F12 = pd.read_parquet('F12.parquet')
df_F12

Unnamed: 0,user_id,release_year,genres,sum_playtime_forever
0,--000--,2006,"[Indie, Simulation]",1850.0
1,--000--,2009,[Action],5329.0
2,--000--,2010,"[Action, Free to Play, Indie]",66.0
3,--000--,2011,"[Action, Adventure, Indie, RPG, Action, Free t...",15564.0
4,--000--,2012,"[Action, Adventure, Casual, Indie, RPG, Action...",118326.0
...,...,...,...,...
660198,zzzmidmiss,2010,"[Action, Free to Play, Indie, Adventure, Strat...",1237.0
660199,zzzmidmiss,2011,"[Action, Free to Play, Massively Multiplayer, ...",2514.0
660200,zzzmidmiss,2012,"[Action, Adventure, Action, Free to Play, Indi...",14467.0
660201,zzzmidmiss,2013,"[Action, Adventure, Indie, RPG, Action, Free t...",423.0


In [3]:
df_F12.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 660203 entries, 0 to 660202
Data columns (total 4 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   user_id               660203 non-null  object 
 1   release_year          660203 non-null  int64  
 2   genres                660203 non-null  object 
 3   sum_playtime_forever  660203 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 20.1+ MB


The columns genres of the df_F12 dataframe comes as an array from the .parquet file. To optimize the performance of the functions, it is better to convert those items into lists.

In [4]:
def convert_to_list(genres_array):
    return [element.strip("'") for element in genres_array]

# Aplica la función a la columna 'genres' y reemplaza la columna
df_F12['genres'] = df_F12['genres'].apply(convert_to_list)
df_F12

Unnamed: 0,user_id,release_year,genres,sum_playtime_forever
0,--000--,2006,"[Indie, Simulation]",1850.0
1,--000--,2009,[Action],5329.0
2,--000--,2010,"[Action, Free to Play, Indie]",66.0
3,--000--,2011,"[Action, Adventure, Indie, RPG, Action, Free t...",15564.0
4,--000--,2012,"[Action, Adventure, Casual, Indie, RPG, Action...",118326.0
...,...,...,...,...
660198,zzzmidmiss,2010,"[Action, Free to Play, Indie, Adventure, Strat...",1237.0
660199,zzzmidmiss,2011,"[Action, Free to Play, Massively Multiplayer, ...",2514.0
660200,zzzmidmiss,2012,"[Action, Adventure, Action, Free to Play, Indi...",14467.0
660201,zzzmidmiss,2013,"[Action, Adventure, Indie, RPG, Action, Free t...",423.0


In [5]:
df_F345 = pd.read_parquet('F345.parquet')
df_F345

Unnamed: 0,item_id,title,release_year,genres,user_id,sum_playtime_forever,recommend,sentiment_analysis,review_year
0,10.0,Counter-Strike,2000,['Action'],71251241,566.0,True,2,2015
1,10.0,Counter-Strike,2000,['Action'],76561198015886143,144786.0,True,1,2015
2,10.0,Counter-Strike,2000,['Action'],76561198040188061,3895.0,True,2,2011
3,10.0,Counter-Strike,2000,['Action'],76561198072207162,54.0,True,2,2014
4,10.0,Counter-Strike,2000,['Action'],76561198076217855,3848.0,True,2,2014
...,...,...,...,...,...,...,...,...,...
39144,521430.0,Super Switch,2016,['Indie'],76561198075141715,5.0,True,2,2016
39145,521570.0,You Have 10 Seconds 2,2016,"['Casual', 'Free to Play', 'Indie']",76561198071122396,25.0,True,0,2016
39146,521570.0,You Have 10 Seconds 2,2016,"['Casual', 'Free to Play', 'Indie']",TfhuAWGscvg,54.0,True,0,2016
39147,521990.0,Galactic Storm,2016,"['Action', 'Indie']",mikeyg74,16.0,True,2,2016


In [6]:
df_F345.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39149 entries, 0 to 39148
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   item_id               39149 non-null  float64
 1   title                 39149 non-null  string 
 2   release_year          39149 non-null  int64  
 3   genres                39015 non-null  object 
 4   user_id               39149 non-null  object 
 5   sum_playtime_forever  39149 non-null  float64
 6   recommend             39149 non-null  bool   
 7   sentiment_analysis    39149 non-null  int64  
 8   review_year           39149 non-null  int64  
dtypes: bool(1), float64(2), int64(3), object(2), string(1)
memory usage: 2.4+ MB


Now, let's take a look at the functions or queries that we need to develop:

1. **PlayTimeGenre**(`genre`: str) Function:

It should return the release year with the most played hours for the given `genre`.

Example of return: {"Release year with the most played hours for Genre X" : 2013}

2. **UserForGenre**(`genre`: str) Function:

It should return the user who has accumulated the most played hours for the given `genre` and a list of the accumulated playtime by release year.

Example of return: {"User with the most played hours for Genre X" : us213ndjss09sdf, "Playtime":[{Year: 2013, Hours: 203}, {Year: 2012, Hours: 100}, {Year: 2011, Hours: 23}]}

3. **UsersRecommend**(`year`: int) Function:

It returns the top 3 games MOST recommended by users for the given `year` (reviews.recommend = True and positive/neutral comments).

Example of return: [{"Rank 1" : X}, {"Rank 2" : Y},{"Rank 3" : Z}]

4. **UsersNotRecommend**(`year`: int) Function:

It returns the top 3 games LEAST recommended by users for the given `year` (reviews.recommend = False and negative comments).

Example of return: [{"Rank 1" : X}, {"Rank 2" : Y},{"Rank 3" : Z}]

5. **Sentiment_analysis**(`year`: int) Function:

Based on the release `year`, it returns a list with the count of user review records categorized with sentiment analysis.

Example of return: {"Negative": 182, "Neutral": 120, "Positive": 278}

### 1. **PlayTimeGenre**(`genre`: str) Function:

It should return the release year with the most played hours for the given `genre`.

Example of return: {"Release year with the most played hours for Genre X" : 2013}

For this function, we may need only the following columns of the df_F12 dataframe:

- genres.
- release_year.
- sum_playtime_forever.

In [7]:
df_f1 = df_F12[['genres','release_year','sum_playtime_forever']]
df_f1

Unnamed: 0,genres,release_year,sum_playtime_forever
0,"[Indie, Simulation]",2006,1850.0
1,[Action],2009,5329.0
2,"[Action, Free to Play, Indie]",2010,66.0
3,"[Action, Adventure, Indie, RPG, Action, Free t...",2011,15564.0
4,"[Action, Adventure, Casual, Indie, RPG, Action...",2012,118326.0
...,...,...,...
660198,"[Action, Free to Play, Indie, Adventure, Strat...",2010,1237.0
660199,"[Action, Free to Play, Massively Multiplayer, ...",2011,2514.0
660200,"[Action, Adventure, Action, Free to Play, Indi...",2012,14467.0
660201,"[Action, Adventure, Indie, RPG, Action, Free t...",2013,423.0


In [8]:
df_f1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 660203 entries, 0 to 660202
Data columns (total 3 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   genres                660203 non-null  object 
 1   release_year          660203 non-null  int64  
 2   sum_playtime_forever  660203 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 15.1+ MB


Now as part of the process of validating that the genre entered by the user is valid, we need to know all the genres that are in the dataset.

In [9]:
unique_genres = set()
for genres_list in df_f1['genres']:
    unique_genres.update(genres_list)

# Let's obtain the unique list of genres
unique_genres_list = list(unique_genres)
unique_genres_list

['Utilities',
 'Early Access',
 'Web Publishing',
 'Education',
 'Action',
 'Design &amp; Illustration',
 'Video Production',
 'Simulation',
 'Adventure',
 'Racing',
 'Sports',
 'Strategy',
 'Photo Editing',
 'Casual',
 'Massively Multiplayer',
 'Indie',
 'RPG',
 'Free to Play',
 'Animation &amp; Modeling',
 'Audio Production',
 'Software Training']

To improve the function performance, we need that all the genres are in the same format. So we are going to capitalize all the words using the following function:

In [10]:
def normalize_list_of_words(sentences_list):
    # Function to normalize a list of sentences capitalizing the first letter of each word.
    normalized_sentences = []
    for sentence in sentences_list:
        words = sentence.split()  # We divide the sentence into words
        normalized_words = [word.capitalize() for word in words]  # We capitalize each word
        normalized_sentence = ' '.join(normalized_words)  # Join the words into a sentence
        normalized_sentences.append(normalized_sentence)
    return normalized_sentences

In [11]:
normalized_genres = normalize_list_of_words(unique_genres_list)
normalized_genres

['Utilities',
 'Early Access',
 'Web Publishing',
 'Education',
 'Action',
 'Design &amp; Illustration',
 'Video Production',
 'Simulation',
 'Adventure',
 'Racing',
 'Sports',
 'Strategy',
 'Photo Editing',
 'Casual',
 'Massively Multiplayer',
 'Indie',
 'Rpg',
 'Free To Play',
 'Animation &amp; Modeling',
 'Audio Production',
 'Software Training']

We can check that there are 21 genres in the dataset. We are going to use this list as a validation tool in the function.

This function has the following steps:

1. Evaluates that the genre inserted by the user is a string.
2. Nomalize the genre entered by the user capitalizing the first word of each word.
3. Check if the genre normalized is in the list of normalized genres.
4. There are two genres that need special attention: "Free to Play","RPG". They are not normalized in the original dataframe of genres. We need to make sure that they are written the same way they are in the original dataframe.
5. We select all the rows that contain that genre in the dataframe.
6. In the filtered dataframe, we group by 'release_year' summing 'sum_playtime_forever'.
7. Finally we select the 'release_year' that has the maximum 'sum_playtime_forever'.
8. Organize the results as a JSON format.

**NOTE:** This function has some quality features that send messages to the user when the genre is written with the correct letters but not respecting the standardized way of writting (First letter in capital and the rest without capitalizing). Also, send a message when the user inserts a genre that does not exist or write something different of string.  

Also, we need to create a function to capitalize the first leter of a word in order to estandarize the words entered by the user.

In [12]:
def capitalize_first_words_in_sentence(sentence):
    # Function to capitalize the first letter of each word in a sentence
    words = sentence.split()  # Split the sentence into words
    capitalized_words = [word[0].capitalize() + word.lower()[1:] for word in words]  # Capitalize the first letter of each word
    return ' '.join(capitalized_words)  # Join the words into a capitalized sentence

Now, let's go to the function:

In [13]:
def PlayTimeGenre(genre):
    # This function returns the release year with the most played hours for the given 'genre'.
    # Example of return: {"Release year with the most played hours for Genre X" : 2013}
    
    #We need to mke sure that the genre inserted by the user is a string
    if type(genre) == str:
        # The first thing that we need to do is to validate that the genre entered by the user is in the list of genres.
        # We need to normalize the genre entered by the user.
        norm_genre = capitalize_first_words_in_sentence(genre)

        #Now, let's look for that genre in the normalized list of genres:
        if norm_genre in normalized_genres:

            # Let's suppose that we are going to look for the genre 'norm_genre' as an input from the user.
            genre_to_find = norm_genre

            # There are two genres that need special attention: "Free to Play","RPG". Because they are not normalized in the
            # original dataframe where we are going to search later. So let's transform them:
            if norm_genre == 'Free To Play':
                genre_to_find = 'Free to Play'
            elif norm_genre == 'Rpg':
                genre_to_find = 'RPG'

            # Create a mask to select the rows that contain the desired genre and handle NaN values
            mask = df_f1['genres'].apply(lambda x: genre_to_find in x)

            # We filtrate the DataFrame with the mask
            df_f1_by_genre = df_f1[mask]

            # We need to do a group by 'release_year' summing 'sum_playtime_forever'
            grouped_df_f1_by_year = df_f1_by_genre.groupby('release_year')['sum_playtime_forever'].sum().reset_index()

            # Now we only need the 'release_year' and 'sum_playtime_forever'
            df1genre = grouped_df_f1_by_year[['release_year','sum_playtime_forever']]

            # Now we sort by the 'sum_playtime_forever' to have the year with most played hours:
            df1genre.sort_values(by='sum_playtime_forever', ascending=False, inplace=True)

            # We reset the index
            df1genre.reset_index(drop=True, inplace=True)

            year_most_hours_played = df1genre.iloc[0,0]
            max_sum_playtime_forever = df1genre.iloc[0,1]

            # Now, we create the message in JSON format
            F1message = {
                f"Release year with the most played hours for Genre {genre_to_find}": int(year_most_hours_played)
            }
            # We convert the message into a JSON format
            json_message = json.dumps(F1message, indent=4)

            return print(json_message)
            
        else:
            return print("The genre entered is not valid. Please try again.")
    else:
        return print("Plese insert a genre as a string value (Text).")

In [14]:
PlayTimeGenre('Action')

{
    "Release year with the most played hours for Genre Action": 2012
}


In [15]:
PlayTimeGenre('Free TO PLay')

{
    "Release year with the most played hours for Genre Free to Play": 2012
}


In [16]:
PlayTimeGenre('AdvEnture')

{
    "Release year with the most played hours for Genre Adventure": 2013
}


In [17]:
PlayTimeGenre('AniMAtion &amp; ModELing')

{
    "Release year with the most played hours for Genre Animation &amp; Modeling": 2015
}


In [18]:
PlayTimeGenre('Game')

The genre entered is not valid. Please try again.


In [19]:
PlayTimeGenre(2000)

Plese insert a genre as a string value (Text).


### 2. **UseForGenre**(`genre`: str) Function:

It should return the user who has accumulated the most played hours for the given `genre` and a list of the accumulated playtime by release year.

Example of return: {"User with the most played hours for Genre X" : us213ndjss09sdf, "Playtime":[{Year: 2013, Hours: 203}, {Year: 2012, Hours: 100}, {Year: 2011, Hours: 23}]}

For this function, we may need only the following columns of the df_F12 dataframe:

- user_id.
- genres.
- release_year.
- sum_playtime_forever.

In [20]:
df_f2 = df_F12[['user_id','genres','release_year','sum_playtime_forever']]
df_f2

Unnamed: 0,user_id,genres,release_year,sum_playtime_forever
0,--000--,"[Indie, Simulation]",2006,1850.0
1,--000--,[Action],2009,5329.0
2,--000--,"[Action, Free to Play, Indie]",2010,66.0
3,--000--,"[Action, Adventure, Indie, RPG, Action, Free t...",2011,15564.0
4,--000--,"[Action, Adventure, Casual, Indie, RPG, Action...",2012,118326.0
...,...,...,...,...
660198,zzzmidmiss,"[Action, Free to Play, Indie, Adventure, Strat...",2010,1237.0
660199,zzzmidmiss,"[Action, Free to Play, Massively Multiplayer, ...",2011,2514.0
660200,zzzmidmiss,"[Action, Adventure, Action, Free to Play, Indi...",2012,14467.0
660201,zzzmidmiss,"[Action, Adventure, Indie, RPG, Action, Free t...",2013,423.0


In [21]:
df_f2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 660203 entries, 0 to 660202
Data columns (total 4 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   user_id               660203 non-null  object 
 1   genres                660203 non-null  object 
 2   release_year          660203 non-null  int64  
 3   sum_playtime_forever  660203 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 20.1+ MB


For this function we can use as a base the function we developed before, but let's add the user_id into the equation.

This function has the following steps:

1. Evaluates that the genre inserted by the user is a string.
2. Nomalize the genre entered by the user capitalizing the first word of each word.
3. Check if the genre normalized is in the list of normalized genres.
4. There are two genres that need special attention: "Free to Play","RPG". They are not normalized in the original dataframe of genres. We need to make sure that they are written the same way they are in the original dataframe.
5. We select all the rows that contain that genre in the dataframe.
6. In the filtered dataframe, we group by 'release_year' and 'user_id' summing 'sum_playtime_forever' (Dataframe 1)
7. In parallel we are going to create another dataframe (Dataframe 2) grouping by 'user_id' summing 'sum_playtime_forever' in order to know who is the user with the highest 'sum_playtime_forever'.
8. Now, having the 'user_id' with the highest 'sum_playtime_forever', finally we select the rows of the Dataframe 1 that contains that 'user_id' in order to have all the years with the 'sum_playtime_forever' for that 'user_id' and the genre inserted by the user.
9. Organize the results as a JSON format.

**NOTE:** This function has some quality features that send messages to the user when the genre is written with the correct letters but not respecting the standardized way of writting (First letter in capital and the rest without capitalizing). Also, send a message when the user inserts a genre that does not exist or write something different of string.  

In [22]:
def UseForGenre(genre):
    # This function return the user who has accumulated the most played hours for the given `genre` and a list of 
    # the accumulated playtime by release year.
    #Example of return: {"User with the most played hours for Genre X" : us213ndjss09sdf, 
    #                     "Playtime":[{Year: 2013, Hours: 203}, {Year: 2012, Hours: 100}, {Year: 2011, Hours: 23}]}

    #We need to mke sure that the genre inserted by the user is a string
    if type(genre) == str:
    
        # The first thing that we need to do is to validate that the genre entered by the user is in the list of genres.
        # We need to normalize the genre entered by the user.
        norm_genre = capitalize_first_words_in_sentence(genre)

        #Now, let's look for that genre in the normalized list of genres:
        if norm_genre in normalized_genres:

            # Let's suppose that we are going to look for the genre 'norm_genre' as an input from the user.
            genre_to_find = norm_genre

            # There are two genres that need special attention: "Free to Play","RPG". Because they are not normalized in the
            # original dataframe where we are going to search later. So let's transform them:
            if norm_genre == 'Free To Play':
                genre_to_find = 'Free to Play'
            elif norm_genre == 'Rpg':
                genre_to_find = 'RPG'

            # We create a mask to select the rows that contain the desired genre and we take care of the NaN values
            mask = df_f2['genres'].apply(lambda x: genre_to_find in x)

            # We filtrate the DataFrame with the mask
            df_f2_by_genre = df_f2[mask]

            # We need to do a group by 'user_id' and 'release_year' summing 'sum_playtime_forever'
            grouped_df_f2_by_user_year = df_f2_by_genre.groupby(['user_id','release_year'])['sum_playtime_forever'].sum().reset_index()

            # Also we are going to group by 'user_id' summing 'sum_playtime_forever' in order to find the user that played the most.
            grouped_df_f2_by_user = df_f2_by_genre.groupby(['user_id'])['sum_playtime_forever'].sum().reset_index()

            # Now we sort by the 'sum_playtime_forever' to have the year with most played hours:
            grouped_df_f2_by_user.sort_values(by='sum_playtime_forever', ascending=False, inplace=True)

            # We reset the index
            grouped_df_f2_by_user.reset_index(drop=True, inplace=True)

            #User with more hours played
            user_most_hours_played = grouped_df_f2_by_user.iloc[0,0]

            # Now we are going to make a mask in the dataframe 'grouped_df_f2_by_user_year' just to have the data of that user
            # that had the maximum hours played
            mask = grouped_df_f2_by_user_year['user_id'] == user_most_hours_played
            resultF2 = grouped_df_f2_by_user_year[mask]

            #Now, we create a list of dictionaries for the year and sum_playtime_forever
            playtime_list = resultF2.rename(columns={'release_year': 'Release Year', 'sum_playtime_forever': 'Hours'})[['Release Year', 'Hours']].to_dict(orient='records')

            # Now, we create the message in JSON format
            F2message = {
                f"User with the most played hours for Genre {genre_to_find}": user_most_hours_played,
                "Playtime": playtime_list
            }
            # We convert the message into a JSON format
            json_message = json.dumps(F2message, indent=4)

            return print(json_message)
        else:
            return print("The genre entered is not valid. Please try again.")
    else:
        return print("Plese insert a genre as a string value (Text).")

In [23]:
UseForGenre('Action')

{
    "User with the most played hours for Genre Action": "REBAS_AS_F-T",
    "Playtime": [
        {
            "Release Year": 1998,
            "Hours": 2.0
        },
        {
            "Release Year": 1999,
            "Hours": 227.0
        },
        {
            "Release Year": 2001,
            "Hours": 33.0
        },
        {
            "Release Year": 2002,
            "Hours": 1.0
        },
        {
            "Release Year": 2003,
            "Hours": 5589.0
        },
        {
            "Release Year": 2004,
            "Hours": 9102.0
        },
        {
            "Release Year": 2005,
            "Hours": 5519.0
        },
        {
            "Release Year": 2006,
            "Hours": 3556.0
        },
        {
            "Release Year": 2007,
            "Hours": 11585.0
        },
        {
            "Release Year": 2008,
            "Hours": 9663.0
        },
        {
            "Release Year": 2009,
            "Hours": 118276.0
        },
 

In [24]:
UseForGenre('MAssIvely MultipLayer')

{
    "User with the most played hours for Genre Massively Multiplayer": "REBAS_AS_F-T",
    "Playtime": [
        {
            "Release Year": 2010,
            "Hours": 134037.0
        },
        {
            "Release Year": 2011,
            "Hours": 452856.0
        },
        {
            "Release Year": 2012,
            "Hours": 560808.0
        },
        {
            "Release Year": 2013,
            "Hours": 955010.0
        },
        {
            "Release Year": 2014,
            "Hours": 1129480.0
        },
        {
            "Release Year": 2015,
            "Hours": 2772390.0
        },
        {
            "Release Year": 2016,
            "Hours": 3282602.0
        },
        {
            "Release Year": 2017,
            "Hours": 121699.0
        }
    ]
}


In [25]:
UseForGenre('DesiGN &amp; IllusTRation')

{
    "User with the most played hours for Genre Design &amp; Illustration": "thugnificent",
    "Playtime": [
        {
            "Release Year": 2007,
            "Hours": 6987.0
        },
        {
            "Release Year": 2012,
            "Hours": 109437.0
        },
        {
            "Release Year": 2013,
            "Hours": 162179.0
        },
        {
            "Release Year": 2014,
            "Hours": 290685.0
        },
        {
            "Release Year": 2015,
            "Hours": 632128.0
        },
        {
            "Release Year": 2016,
            "Hours": 480223.0
        }
    ]
}


In [26]:
UseForGenre('Game')

The genre entered is not valid. Please try again.


In [27]:
UseForGenre(2000)

Plese insert a genre as a string value (Text).


### 3. **UsersRecommend**(`year`: int) Function:

It returns the top 3 games MOST recommended by users for the given `year` (reviews.recommend = True and positive/neutral comments).

Example of return: [{"Rank 1" : X}, {"Rank 2" : Y},{"Rank 3" : Z}]

For this function, we may need only the following columns of the df_F345 dataframe:

- item_id.
- title.
- recommend.
- sentiment_analysis.
- review_year.

In [28]:
df_f3 = df_F345[['item_id','title','recommend','sentiment_analysis','review_year']]
df_f3

Unnamed: 0,item_id,title,recommend,sentiment_analysis,review_year
0,10.0,Counter-Strike,True,2,2015
1,10.0,Counter-Strike,True,1,2015
2,10.0,Counter-Strike,True,2,2011
3,10.0,Counter-Strike,True,2,2014
4,10.0,Counter-Strike,True,2,2014
...,...,...,...,...,...
39144,521430.0,Super Switch,True,2,2016
39145,521570.0,You Have 10 Seconds 2,True,0,2016
39146,521570.0,You Have 10 Seconds 2,True,0,2016
39147,521990.0,Galactic Storm,True,2,2016


In [29]:
df_f3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39149 entries, 0 to 39148
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   item_id             39149 non-null  float64
 1   title               39149 non-null  string 
 2   recommend           39149 non-null  bool   
 3   sentiment_analysis  39149 non-null  int64  
 4   review_year         39149 non-null  int64  
dtypes: bool(1), float64(1), int64(2), string(1)
memory usage: 1.2 MB


This function returns the top 3 games MOST recommended by users for the given `year` (reviews.recommend = True and positive/neutral comments).

Example of return: [{"Rank 1" : item1}, {"Rank 2" : item2},{"Rank 3" : item3}]

To select the most recommended items we are going to use the following criteria:

1. Eliminate all the rows that have 'recommend = False' and have sentiment_analysis = 0
2. Select only the rows that have the review_year = year entered by the user. 
3. Group by item_id summing all the sentiment_analysis and sort the dataframe by sum of sentiment_analysis (Descending).
4. Select the top 3 items with the highest sum of sentiment analysis.
5. Organize the results as a JSON format.

**NOTE:** This function has some quality features that send messages to the user when the the user inserts a year that does not exist or write something different of integer.


We can do the step one, outside of the function:

In [30]:
mask = (df_f3['recommend'] == True) & (df_f3['sentiment_analysis'] != 0)
df_f3f = df_f3[mask]
df_f3 = df_f3f.reset_index(drop=True)
df_f3

Unnamed: 0,item_id,title,recommend,sentiment_analysis,review_year
0,10.0,Counter-Strike,True,2,2015
1,10.0,Counter-Strike,True,1,2015
2,10.0,Counter-Strike,True,2,2011
3,10.0,Counter-Strike,True,2,2014
4,10.0,Counter-Strike,True,2,2014
...,...,...,...,...,...
31000,514520.0,Sparky's Hunt,True,2,2016
31001,516040.0,Everything is Peachy,True,2,2016
31002,521340.0,True or False,True,2,2016
31003,521430.0,Super Switch,True,2,2016


Now, let's go to the function:

In [31]:
def UsersRecommend(year):
    # This function returns the top 3 games MOST recommended by users for the given `year` (reviews.recommend = True and positive/neutral comments).
    # Example of return: [{"Rank 1" : item1}, {"Rank 2" : item2},{"Rank 3" : item3}]

    # First, we need to make sure that the year inserted by the user is a number
    if type(year) == int: 
        # Now, let's select only the data for the year given by the user.
        mask = df_f3['review_year'] == year
        df_f3_review_year = df_f3[mask].reset_index(drop=True)

        # Group by 'title' summing all the sentiment_analysis and sort the dataframe by sum of sentiment_analysis.
        grouped_df_f3_review_year = df_f3_review_year.groupby(['title'])['sentiment_analysis'].sum().reset_index()

        # Now we sort by the 'sentiment_analysis' to have the items with the highest sentiment_analysis:
        grouped_df_f3_review_year.sort_values(by='sentiment_analysis', ascending=False, inplace=True)

        # We need to verify if the Dataframe is not empty, which means that there are no reviews for that year:
        if not grouped_df_f3_review_year.empty:
            #Rank 1
            item_Rank_1 = grouped_df_f3_review_year.iloc[0,0]
            #Rank 2
            item_Rank_2 = grouped_df_f3_review_year.iloc[1,0]
            #Rank 3
            item_Rank_3 = grouped_df_f3_review_year.iloc[2,0]

            #Now, we create the dataframe that we are going to convert into JSON Format
            dataF3 = {
                'Rank': ['Position 1', 'Position 2', 'Position 3'],
                'title': [item_Rank_1, item_Rank_2, item_Rank_3]
            }

            #Now, we create a list of dictionaries for the Rank and item_id
            json_data = [{"Rank " + str(index + 1): item} for index, item in enumerate(dataF3['title'])]

            # Now, we create the message in JSON format
            json_message = json.dumps(json_data, indent=None, ensure_ascii=False)

            return print(json_message)
        else:
            return print("The year inserted has not reviews to calculate the ranking of the most recommended items. Please try with another year.")
    else:
        return print("Please insert a valid year as an integer number.")

In [32]:
UsersRecommend(2000)

The year inserted has not reviews to calculate the ranking of the most recommended items. Please try with another year.


In [33]:
UsersRecommend(2012)

[{"Rank 1": "Garry's Mod"}, {"Rank 2": "Terraria"}, {"Rank 3": "Left 4 Dead 2"}]


In [34]:
UsersRecommend(2016)

[{"Rank 1": "Counter-Strike: Global Offensive"}, {"Rank 2": "Garry's Mod"}, {"Rank 3": "Unturned"}]


In [35]:
UsersRecommend(2016.3)

Please insert a valid year as an integer number.


In [36]:
UsersRecommend("Year")

Please insert a valid year as an integer number.


### 4. **UsersNotRecommend**(`year`: int) Function:

It returns the top 3 games LEAST recommended by users for the given `year` (reviews.recommend = False and negative comments).

Example of return: [{"Rank 1" : X}, {"Rank 2" : Y},{"Rank 3" : Z}]

For this function, we may need only the following columns of the df_F345 dataframe:

- item_id.
- recommend.
- sentiment_analysis.
- review_year.

In [37]:
df_f4 = df_F345[['item_id','title','recommend','sentiment_analysis','review_year']]
df_f4

Unnamed: 0,item_id,title,recommend,sentiment_analysis,review_year
0,10.0,Counter-Strike,True,2,2015
1,10.0,Counter-Strike,True,1,2015
2,10.0,Counter-Strike,True,2,2011
3,10.0,Counter-Strike,True,2,2014
4,10.0,Counter-Strike,True,2,2014
...,...,...,...,...,...
39144,521430.0,Super Switch,True,2,2016
39145,521570.0,You Have 10 Seconds 2,True,0,2016
39146,521570.0,You Have 10 Seconds 2,True,0,2016
39147,521990.0,Galactic Storm,True,2,2016


In [38]:
df_f4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39149 entries, 0 to 39148
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   item_id             39149 non-null  float64
 1   title               39149 non-null  string 
 2   recommend           39149 non-null  bool   
 3   sentiment_analysis  39149 non-null  int64  
 4   review_year         39149 non-null  int64  
dtypes: bool(1), float64(1), int64(2), string(1)
memory usage: 1.2 MB


This function returns the top 3 games MOST recommended by users for the given `year` (reviews.recommend = True and positive/neutral comments).

Example of return: [{"Rank 1" : item1}, {"Rank 2" : item2},{"Rank 3" : item3}]

To select the less recommended items we are going to use the following criteria:

1. Eliminate all the rows that have 'recommend = True'.
2. Select only the rows that have the review_year = year entered by the user. 
4. Select the top 3 items with the most quantity of 'recommend' = False (counter).
5. Organize the results as a JSON format.

**NOTE:** This function has some quality features that send messages to the user when the the user inserts a year that does not exist or write something different of integer.

We can do the step one, outside of the function:

In [39]:
mask = (df_f4['recommend'] == False) & (df_f4['sentiment_analysis'] == 0)
df_f4f = df_f4[mask]
df_f4 = df_f4f.reset_index(drop=True)
df_f4


Unnamed: 0,item_id,title,recommend,sentiment_analysis,review_year
0,20.0,Team Fortress Classic,False,0,2014
1,20.0,Team Fortress Classic,False,0,2016
2,220.0,Half-Life 2,False,0,2014
3,220.0,Half-Life 2,False,0,2014
4,240.0,Counter-Strike: Source,False,0,2015
...,...,...,...,...,...
1399,462930.0,AdVenture Communist,False,0,2016
1400,466910.0,Worm.is: The Game,False,0,2016
1401,488280.0,Gun Done,False,0,2016
1402,501760.0,Francisca,False,0,2016


Now, let's go to the function:

In [40]:
def UsersNotRecommend(year):
    # This function returns the top 3 games LEAST recommended by users for the given `year` (reviews.recommend = False and negative comments).
    # Example of return: [{"Rank 1" : X}, {"Rank 2" : Y},{"Rank 3" : Z}]

    # First, we need to make sure that the year inserted by the user is a number
    if type(year) == int: 
        # Now, let's select only the data for the year given by the user.
        mask = df_f4['review_year'] == year
        df_f4_review_year = df_f4[mask].reset_index(drop=True)

        # We need to verify if the Dataframe is not empty, which means that there are no reviews for that year:
        if not df_f4_review_year.empty:

            # We count the values considering the 'item_id'
            counter = df_f4_review_year['title'].value_counts()

            # Three east recommended games
            top_3_least_recommended = counter.head(3)

            #Now, we create a list of dictionaries for the Rank and item_id
            json_data = [{"Rank " + str(index + 1): item} for index, item in enumerate(top_3_least_recommended.index)]
            
            # Now, we create the message in JSON format
            json_message = json.dumps(json_data, indent=None, ensure_ascii=False)

            return print(json_message)
        else:
            return print("The year inserted has not reviews to calculate the ranking of the most recommended items. Please try with another year.")
    else:
        return print("Please insert a valid year as an integer number.")

In [41]:
UsersNotRecommend(2000)

The year inserted has not reviews to calculate the ranking of the most recommended items. Please try with another year.


In [42]:
UsersNotRecommend(2012)

[{"Rank 1": "PAYDAY™ The Heist"}, {"Rank 2": "The Kings' Crusade"}, {"Rank 3": "Red Faction®: Armageddon™"}]


In [43]:
UsersNotRecommend(2016)

[{"Rank 1": "No Man's Sky"}, {"Rank 2": "Counter-Strike: Global Offensive"}, {"Rank 3": "Robocraft"}]


In [44]:
UsersNotRecommend(2016.3)

Please insert a valid year as an integer number.


In [45]:
UsersNotRecommend("Year")

Please insert a valid year as an integer number.


### 5. **Sentiment_analysis**(`year`: int) Function:

Based on the release `year`, it returns a list with the count of user review records categorized with sentiment analysis.

Example of return: {"Negative": 182, "Neutral": 120, "Positive": 278}

For this function, we may need only the following columns of the df_F345 dataframe:

- sentiment_analysis.
- release_year.

In [46]:
df_f5 = df_F345[['sentiment_analysis','release_year']]
df_f5

Unnamed: 0,sentiment_analysis,release_year
0,2,2000
1,1,2000
2,2,2000
3,2,2000
4,2,2000
...,...,...
39144,2,2016
39145,0,2016
39146,0,2016
39147,2,2016


In [47]:
df_f5.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39149 entries, 0 to 39148
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype
---  ------              --------------  -----
 0   sentiment_analysis  39149 non-null  int64
 1   release_year        39149 non-null  int64
dtypes: int64(2)
memory usage: 611.8 KB


To build this function we need to do the following:

1. We make  sure that the year inserted by the user is an integer.
2. We build a mask to filter the dataframe by the year inserted.
3. If the year inserted does not have values, we need to show a message to the user.
4. Then we need to apply the function .value_counts()
5. Organize the results as a JSON format.

**NOTE:** This function has some quality features that send messages to the user when the the user inserts a year that does not exist or write something different of integer.

In [48]:
def Sentiment_analysis(year):
    # Based on the release `year`, it returns a list with the count of user review records categorized with sentiment analysis.
    # Example of return: {"Negative": 182, "Neutral": 120, "Positive": 278}

    # First, we need to make sure that the year inserted by the user is a number
    if type(year) == int: 
        # Now, let's select only the data for the year given by the user.
        mask = df_f5['release_year'] == year
        df_f5_review_year = df_f5[mask].reset_index(drop=True)

        # We need to verify if the Dataframe is not empty, which means that there are no reviews for that year:
        if not df_f5_review_year.empty:
            # We count the values considering the 'sentiment_analysis'
            counter = df_f5_review_year['sentiment_analysis'].value_counts().sort_index()

            #Now, we create the dataframe that we are going to convert into JSON Format
            dataF5 = {
                "Negative": int(counter.get(0, 0)),
                "Neutral": int(counter.get(1, 0)),
                "Positive": int(counter.get(2, 0))
            }

            # Now, we create the message in JSON format
            json_message = json.dumps(dataF5, indent=None, ensure_ascii=False)

            return print(json_message)
        else:
            return print("The year inserted has not reviews to calculate the categories of sentiment analysis. Please try with another year.")
    else:
        return print("Please insert a valid year as an integer number.")

In [49]:
Sentiment_analysis(2015)

{"Negative": 900, "Neutral": 979, "Positive": 3693}


In [50]:
Sentiment_analysis(2000)

{"Negative": 7, "Neutral": 16, "Positive": 59}


In [51]:
Sentiment_analysis(2018)

The year inserted has not reviews to calculate the categories of sentiment analysis. Please try with another year.


In [52]:
Sentiment_analysis("Year")

Please insert a valid year as an integer number.


### 6. **Game_Recommendation**(`item_id`: int) Function:

By entering the `item_id`, we should receive a list with 5 recommended games similar to the one entered.


  Example of return: [{"Rec 1": "PAYDAY™ The Heist"}, {"Rec 2": "The Kings' Crusade"}, {"Rec 3": "Red Faction®: Armageddon™"}, {"Rec 4": "No Man's Sky"}, {"Rec 5": "Robocraft"} ]

For this purpose we are going to use a method called **Cosine Similarity**.

This is a mathematical measure used to determine the similarity between two vectors in a multi-dimensional space. It is often applied in various fields, including natural language processing, information retrieval, recommendation systems, and more. The cosine similarity method is particularly useful when dealing with high-dimensional data, such as text data, where traditional Euclidean distance measures may not be as effective.

Here's how cosine similarity works:

1. **Vector Representation:** First, we need to represent the objects we want to compare as vectors. These vectors can represent various attributes or features of the objects. In the context of text data, each vector typically represents a document or a piece of text, where each dimension corresponds to a specific word or term, and the value in each dimension can be based on various measures like term frequency, term frequency-inverse document frequency (TF-IDF), word embeddings, etc.

2. **Cosine Calculation:** To compute the cosine similarity between two vectors A and B, you calculate the cosine of the angle between them using the following formula:

    ```python
    cosine_similarity(A, B) = (A · B) / (||A|| * ||B||)
    ```

    (A · B) is the dot product of vectors A and B.
    
    ||A|| and ||B|| are the magnitudes (or Euclidean norms) of vectors A and B, respectively.

3. **Result:** The cosine similarity measure will produce a value between -1 and 1, where:
    - 1 indicates that the vectors are identical (perfect similarity).
    - 0 indicates that the vectors are orthogonal (no similarity).
    - -1 indicates that the vectors are diametrically opposed (perfect dissimilarity).


To develop this process we are going to use the Dataframe FML1 that we developed for this recommendation system.

In [53]:
df_FML1_for_Model = pd.read_parquet('FML1.parquet')
df_FML1_for_Model

Unnamed: 0,item_id,title,tags
0,761140.0,Lost Summoner Kitty,"['Strategy', 'Action', 'Indie', 'Casual', 'Sim..."
1,643980.0,Ironbound,"['Free to Play', 'Strategy', 'Indie', 'RPG', '..."
2,670290.0,Real Pool 3D - Poolians,"['Free to Play', 'Simulation', 'Sports', 'Casu..."
3,767400.0,弹炸人2222,"['Action', 'Adventure', 'Casual']"
4,772540.0,Battle Royale Trainer,"['Action', 'Adventure', 'Simulation', 'FPS', '..."
...,...,...,...
29917,745400.0,Kebab it Up!,"['Action', 'Indie', 'Casual', 'Violent', 'Adve..."
29918,773640.0,Colony On Mars,"['Strategy', 'Indie', 'Casual', 'Simulation']"
29919,733530.0,LOGistICAL: South Africa,"['Strategy', 'Indie', 'Casual']"
29920,610660.0,Russian Roads,"['Indie', 'Simulation', 'Racing']"


In [54]:
df_FML1_for_Model.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29922 entries, 0 to 29921
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   item_id  29922 non-null  float64
 1   title    29922 non-null  string 
 2   tags     29922 non-null  object 
dtypes: float64(1), object(1), string(1)
memory usage: 701.4+ KB


Import the libraries:

In [55]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

We need to preprocess it the data, such as handling missing values and converting text columns (genres, tags, specs) into a single text column for vectorization. We can do this as follows:

In [56]:
# Combine text columns into a single column for vectorization
df_FML1_for_Model['combined_features'] = df_FML1_for_Model['tags']

# Fill missing values
df_FML1_for_Model['combined_features'].fillna('', inplace=True)

df_FML1_for_Model

Unnamed: 0,item_id,title,tags,combined_features
0,761140.0,Lost Summoner Kitty,"['Strategy', 'Action', 'Indie', 'Casual', 'Sim...","['Strategy', 'Action', 'Indie', 'Casual', 'Sim..."
1,643980.0,Ironbound,"['Free to Play', 'Strategy', 'Indie', 'RPG', '...","['Free to Play', 'Strategy', 'Indie', 'RPG', '..."
2,670290.0,Real Pool 3D - Poolians,"['Free to Play', 'Simulation', 'Sports', 'Casu...","['Free to Play', 'Simulation', 'Sports', 'Casu..."
3,767400.0,弹炸人2222,"['Action', 'Adventure', 'Casual']","['Action', 'Adventure', 'Casual']"
4,772540.0,Battle Royale Trainer,"['Action', 'Adventure', 'Simulation', 'FPS', '...","['Action', 'Adventure', 'Simulation', 'FPS', '..."
...,...,...,...,...
29917,745400.0,Kebab it Up!,"['Action', 'Indie', 'Casual', 'Violent', 'Adve...","['Action', 'Indie', 'Casual', 'Violent', 'Adve..."
29918,773640.0,Colony On Mars,"['Strategy', 'Indie', 'Casual', 'Simulation']","['Strategy', 'Indie', 'Casual', 'Simulation']"
29919,733530.0,LOGistICAL: South Africa,"['Strategy', 'Indie', 'Casual']","['Strategy', 'Indie', 'Casual']"
29920,610660.0,Russian Roads,"['Indie', 'Simulation', 'Racing']","['Indie', 'Simulation', 'Racing']"


Now, we use TF-IDF (Term Frequency-Inverse Document Frequency) vectorization to convert our text data into numerical vectors.

In [57]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df_FML1_for_Model['combined_features'])

Now, we calculate cosine similarities between all pairs of items in our dataset. We can use the cosine_similarity function from scikit-learn:

In [58]:
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

We can see that the calculation of the cosine similarities took 34 seconds. This is a high consumption of resources. Let's continue programming the recommendation function and check the results.

Now, we can create a function to recommend games based on a given item_id.

In [59]:
def Game_RecommendationCS(item_id, cosine_sim=cosine_sim):
# By entering the `item_id`, we should receive a list with 5 recommended games similar to the one entered.
    # We review the user enter a number
    if (type(item_id) == float) | (type(item_id) == int):
        # Now, let's review the item_id exists
        if item_id in df_FML1_for_Model['item_id'].values:
            # We capture the index of the game in the column 'item_id'
            game_index = df_FML1_for_Model[df_FML1_for_Model['item_id'] == item_id].index[0]
            # We calculate cosine similarity scores for the input game with all other games
            similar_scores = list(enumerate(cosine_sim[game_index]))
            # We sort the similar_scores based on the cosine similarity values in descending order
            similar_scores = sorted(similar_scores, key=lambda x: x[1], reverse=True)
            # We exclude the input game itself (most similar game will be the input game)
            similar_scores = similar_scores[1:6]
            
            # We create a list of recommended games in a JSON format.
            recommended_games = [{"Rec {}".format(i + 1): df_FML1_for_Model.iloc[score[0]]['title']} for i, score in enumerate(similar_scores)]
            
            return recommended_games
        else:
            return print('The item_id does not exists. Please try again.')
    else:
        return print("The item_id must be a number, please enter a new item_id")

In [60]:
Game_RecommendationCS(761140)

[{'Rec 1': 'World of Cinema - Directors Cut'},
 {'Rec 2': 'Aerial Destruction'},
 {'Rec 3': 'Cyber Complex'},
 {'Rec 4': "Trivia Vault: 1980's Trivia 2"},
 {'Rec 5': 'Trivia Vault: Science &amp; History Trivia 2'}]

In [61]:
Game_RecommendationCS(733530.0)

[{'Rec 1': 'Ticket to Ride - USA 1910'},
 {'Rec 2': 'Ticket to Ride - Europe'},
 {'Rec 3': 'Tribloos 2'},
 {'Rec 4': 'Hive - The Ladybug'},
 {'Rec 5': 'Bounders and Cads'}]

In [62]:
Game_RecommendationCS(610660.0)

[{'Rec 1': 'Car Mechanic Simulator 2015 - Total Modifications'},
 {'Rec 2': 'SoundTracks: The Train Set Game'},
 {'Rec 3': 'RC-AirSim - RC Model Airplane Flight Simulator'},
 {'Rec 4': 'Car Mechanic Simulator 2015 - DeLorean'},
 {'Rec 5': 'Russian Roads'}]

In [63]:
Game_RecommendationCS(10.0)

[{'Rec 1': 'Counter-Strike: Source'},
 {'Rec 2': 'Counter-Strike: Condition Zero'},
 {'Rec 3': 'Insurgency'},
 {'Rec 4': 'Umbrella Corps™/Biohazard Umbrella Corps™'},
 {'Rec 5': 'Counter-Strike: Global Offensive'}]

In [64]:
Game_RecommendationCS(200)

The item_id does not exists. Please try again.


In [65]:
Game_RecommendationCS("item")

The item_id must be a number, please enter a new item_id


We can see that we are recieving the best 5 recommendations based on a item_id entered by the user. However, we can imrpoe the velocity of our calculations without computing the cosine similarities. We can use an approximation method called **LSH (Locality-Sensitive Hashing)**.

**Locality-Sensitive Hashing (LSH)** is a technique used in data mining and information retrieval to approximate similarity between data points in a high-dimensional space. It's often used as an alternative to traditional similarity measures like cosine similarity when dealing with large datasets where exact similarity computation becomes computationally expensive.

For this method we use something called **Random Binary Projections**.

**Random Binary Projections** are a specific type of hash function used in LSH. These hash functions work by randomly generating a set of binary values (0s and 1s) and using them to project the data points onto a lower-dimensional space. The randomness of these projections is a key factor in LSH because it introduces a controlled level of randomness that helps group similar data points together with a higher probability.

Here's how we can program it on Python:

First, we need to import the libraries:

In [66]:
pip install nearpy

Note: you may need to restart the kernel to use updated packages.


In [67]:
import numpy as np
from nearpy import Engine
from nearpy.hashes import RandomBinaryProjections
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

As in the model of cosine-similarities, we need to calculate the vectors and fill the NaN with ''.

In [68]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df_FML1_for_Model['combined_features'].fillna(''))

In [69]:
# We set the number of dimensions in the TF-IDF matrix
num_dimensions = tfidf_matrix.shape[1]

# Also, we set the number of hash functions (random binary projections)
num_hash_functions = 4  # We can adjust this value according to our needs

# Create the LSH engine
engine = Engine(num_dimensions, lshashes=[RandomBinaryProjections('rbp', num_hash_functions)])

In the code above we can see that there is a number of hash_functions. We need to calculate this hiperarameter considering this balance:

**Higher `num_hash_functions` (More Hash Functions):**

- **Pros:**
    **Increased Precision:** With more hash functions, it becomes less likely for dissimilar data points to hash to the same bucket. This can lead to higher precision because there are fewer false positives when searching for similar items.
- **Cons:**
    **Reduced Recall:** While precision may increase, recall (the ability to find all similar items) may decrease. More hash functions mean that similar items must match on multiple hash functions to be considered similar, which could lead to some similar items being missed.

**Lower `num_hash_functions` (Fewer Hash Functions):**

- **Pros:**
    **Increased Recall:** With fewer hash functions, it becomes easier for similar data points to hash to the same bucket, increasing the chances of finding all similar items.
- **Cons:**
    **Reduced Precision:** However, a lower number of hash functions can also result in more false positives because dissimilar items may accidentally hash to the same bucket.

In this case we have chosen 4. Let's see if we have a good approximation of what we obtained with the cosine similarity model.

Now, we iterate through rows of the TF-IDF matrix and associate them with item IDs:

In [70]:
# Loop through each row and its index in the TF-IDF matrix
for i, row in enumerate(tfidf_matrix):

    # Extract the item_id from the corresponding row in the DataFrame 'df_FML1_for_Model'
    game_id = df_FML1_for_Model.iloc[i]['item_id']

    # Store the TF-IDF vector as a flattened array in the LSH engine, associated with the game ID
    engine.store_vector(row.toarray().flatten(), data=game_id)

In [71]:
def Game_Recommendation(item_id, engine=engine):
    """
    Recommends similar games using Locality-Sensitive Hashing (LSH).

    Parameters:
        item_id (int): The ID of the game for which recommendations are requested.
        engine (LSH Engine): The LSH engine used for similarity search.

    Returns:
        list: A list of recommended games in JSON format.
    """
    
    # By entering the `item_id`, we should receive a list with 5 recommended games similar to the one entered.
    # We review the user enter a number
    if (type(item_id) == float) | (type(item_id) == int):
        # Now, let's review the item_id exists
        if item_id in df_FML1_for_Model['item_id'].values:
                # We Get the LSH index of the input game

                # Query the TF-IDF matrix to get the TF-IDF vector for the input game
                query = tfidf_matrix[df_FML1_for_Model['item_id'] == item_id].toarray().flatten()

                # Use LSH to find similar games (neighbors) to the input game
                neighbors = engine.neighbours(query)

                # Recommendations based on LSH

                # Extract the game IDs of the recommended games, excluding the input game, and limit to the top 5
                recommended_game_ids = [neighbor[1] for neighbor in neighbors if neighbor[1] != item_id][:5]

                # Filter the DataFrame to get details of the recommended games (titles)
                recommended_games = df_FML1_for_Model[df_FML1_for_Model['item_id'].isin(recommended_game_ids)][['title']]

                # Construct the list of recommendations in JSON format
                result = [{'Rec {}'.format(i + 1): game} for i, game in enumerate(recommended_games['title'])]

                return result
        else:
            return print('The item_id does not exists. Please try again.')
    else:
        return print("The item_id must be a number, please enter a new item_id")

In [72]:
Game_Recommendation(761140)

[{'Rec 1': 'World of Cinema - Directors Cut'},
 {'Rec 2': 'Aerial Destruction'},
 {'Rec 3': 'Cyber Complex'},
 {'Rec 4': "Trivia Vault: 1980's Trivia 2"},
 {'Rec 5': 'Trivia Vault: Science &amp; History Trivia 2'}]

In [73]:
Game_Recommendation(733530.0)

[{'Rec 1': 'Bumbledore'},
 {'Rec 2': 'Ticket to Ride - USA 1910'},
 {'Rec 3': 'Ticket to Ride - Europe'},
 {'Rec 4': 'Tribloos 2'},
 {'Rec 5': 'Hive - The Ladybug'}]

In [74]:
Game_Recommendation(610660.0)

[{'Rec 1': 'Car Mechanic Simulator 2015 - Maserati'},
 {'Rec 2': 'Car Mechanic Simulator 2015 - Total Modifications'},
 {'Rec 3': 'SoundTracks: The Train Set Game'},
 {'Rec 4': 'RC-AirSim - RC Model Airplane Flight Simulator'},
 {'Rec 5': 'Car Mechanic Simulator 2015 - DeLorean'}]

In [75]:
Game_Recommendation(10)

[{'Rec 1': 'ARMA: Combat Operations'},
 {'Rec 2': 'Warface'},
 {'Rec 3': 'MechWarrior Online™'},
 {'Rec 4': "Tom Clancy's Rainbow Six® Siege - Year 3 Pass"},
 {'Rec 5': 'Quake Live™'}]

In [76]:
Game_Recommendation(10.3)

The item_id does not exists. Please try again.


In [77]:
Game_Recommendation("Item")

The item_id must be a number, please enter a new item_id


We can see that we have almost the same results between the two models, and with the LSH approximation we do not consume so much resources as with the cosine-similariies method. 

In [None]:
import ast
import pandas as pd

# Select the first 5 rows from the 'tags_Evaluation' DataFrame
rows = tags_Evaluation.iloc[0:5, 0]

# Convert strings into lists using ast.literal_eval
rows = rows.apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# Find the intersection of sets among all rows
common_elements = set(rows.iloc[0]).intersection(*rows.iloc[1:])

# Convert the set intersection into a list
common_elements_list = list(common_elements)

# Display the common elements
print(common_elements_list)

We can see that there are 4 tags that are repeated through all the items recommended by the model and the item_id entered by the user.

Now we are going to check how many distinct tags we have for all the items:

In [None]:
import ast

# Concatenate all word lists into a single list
rows = tags_Evaluation.iloc[0:5, 0]
all_words = [word for row in rows for word in ast.literal_eval(row) if isinstance(row, str)]

# Create a set of unique words
unique_words = set(all_words)

# Get the count of unique words
unique_word_count = len(unique_words)

# Print the count of unique words
print(f'Number of distinct tags in the set of rows: {unique_word_count}')

Now, we calculate the **Recall** for the ranodm evaluation above:

In [None]:
# Call the Game_Recommendation function for the user item in question
user_item_id = 10  # Replace this with the ID of the user item you want to evaluate
recommendations = Game_Recommendation(user_item_id)

# Extract recommendation titles
titles = [rec[f'Rec {i}'] for i, rec in enumerate(recommendations, start=1)]

# Initialize variables to keep track of metrics
total_common_features_identified = 0
total_common_features_user = 0

for title in titles:
    # Look up corresponding tags for each title in df_FML1_for_Model
    matching_row = df_FML1_for_Model[df_FML1_for_Model['title'] == title]
    if not matching_row.empty:
        tags = set(matching_row.iloc[0]['tags'].split(', '))
        
        # Calculate actual common features with the user's item
        user_common_features = set(df_FML1_for_Model[df_FML1_for_Model['item_id'] == user_item_id]['tags'].values[0].split(', '))
        
        # Calculate metrics for this recommendation
        num_common_features_identified = len(tags.intersection(user_common_features))
        num_common_features_user = len(user_common_features)
        
        # Update the totals
        total_common_features_identified += num_common_features_identified
        total_common_features_user += num_common_features_user

# Calculate Recall for this user item
if total_common_features_user > 0:
    recall = total_common_features_identified / total_common_features_user
else:
    recall = 0.0

# Print the Recall
print(f"Recall for user_item_id {user_item_id}: {recall:.4f}")

We can see the **Recall is 0.48**. This means that the model has correctly identified 48% of the common tags between the item entered by the user and the recommended items. In other words, the model has found 48% of the tags that should have been recommended to the user with item ID 10 were indeed included in the recommendations. However, we know that all the items have differences that makes them unique and valuable. So this is a good result since the recommendations are not exactly the same, but has some similarities in a balanced way (48%). It allows the user to explore new concepts and characteristics that could be surprisely pleasant.

Now, we calculate the **Precision** for the random evaluation above:

In [None]:
# Call the Game_Recommendation function for the user item in question
user_item_id = 10  # Replace this with the ID of the user item you want to evaluate
recommendations = Game_Recommendation(user_item_id)

# Extract recommendation titles
titles = [rec[f'Rec {i}'] for i, rec in enumerate(recommendations, start=1)]

# Initialize variables to keep track of metrics
total_common_features_identified = 0
total_identified_features = 0

for title in titles:
    # Look up corresponding tags for each title in df_FML1_for_Model
    matching_row = df_FML1_for_Model[df_FML1_for_Model['title'] == title]
    if not matching_row.empty:
        tags = set(matching_row.iloc[0]['tags'].split(', '))
        
        # Calculate common features with the user item
        user_common_features = set(df_FML1_for_Model[df_FML1_for_Model['item_id'] == user_item_id]['tags'].values[0].split(', '))
        
        # Calculate metrics for this recommendation
        num_common_features_identified = len(tags.intersection(user_common_features))
        num_identified_features = len(tags)
        
        # Update the totals
        total_common_features_identified += num_common_features_identified
        total_identified_features += num_identified_features

# Calculate the precision for this user item
if total_identified_features > 0:
    precision = total_common_features_identified / total_identified_features
else:
    precision = 0.0

# Print the precision
print(f"Precision for user_item_id {user_item_id}: {precision:.4f}")

We can see the **Precision is 0.55**. "Precision" refers to how many of the recommended items are actually relevant to the user. A precision score of 0.55 means that 55% of the recommended items were accurate and relevant to the user with item_id 10. A higher precision score indicates a better quality of recommendations because it means that a larger proportion of the recommended items were actually of interest to the user.

Now we calcualte the **F1-Score** for this random evaluation:

In [None]:
# Calculate the F1-Score
f1_score = 2 * (precision * recall) / (precision + recall)

print(f"F1-Score for item_id {user_item_id}: {f1_score:.4f}")

We can see the **F1-Score is 0.51**. This indicates a balance between precision and recall in the evaluation of a model. The F1-Score is a metric that combines both metrics, and its value varies between 0 and 1, where:

- An F1-Score of 0 means the model performs very poorly in terms of balancing precision and recall.
- An F1-Score of 1 means the model performs perfectly in terms of balancing precision and recall.

In this case, an F1-Score of 0.51 suggests that the model has a moderate performance in identifying common features among the recommended items. That is, the model is neither very precise nor has good recall but achieves an acceptable balance between both metrics. Overall, an F1-Score of 0.51 suggests that the model has room for improvement in terms of its ability to identify common tags. However, using an approximation of the cosine-similatiry method and only one column of our entire dataset (tags), we can say that the reommended items given by the model are a goog approximation to what the users may want. Not exactly the same, but with similarities. This was a good balance bewteen the good results obtained and resoruces optimization. 

Finally, we calculate the **Jaccard Similarity Coefficient** for each recommended item.

In [None]:
# Recommended Items Data
item_recommended_1 = ['FPS', 'Realistic', 'Tactical', 'Multiplayer', 'Action', 'Shooter', 'Military', 'Team-Based', 'Co-op', 'Strategy', 'First-Person', 'Online Co-Op', 'War', 'Indie', 'Competitive', 'Simulation', 'PvP', 'Atmospheric', 'Singleplayer', 'Adventure']
item_recommended_2 = ['Action', 'FPS', 'Classic', 'First-Person', "1990's", 'Shooter', 'Retro']
item_recommended_3 = ['FPS', 'Multiplayer', 'Action', 'Shooter', 'Team-Based', 'War', 'First-Person', 'Military', 'Destruction', 'Singleplayer', 'Tactical', 'Co-op', 'Atmospheric', 'Comedy', 'Classic', 'Open World', 'Strategy', 'Adventure', 'Simulation', 'Sandbox']
item_recommended_4 = ['Action', 'FPS', 'Multiplayer', 'Shooter', 'Team-Based', 'Tactical', 'First-Person', 'Competitive', 'PvP', 'Military', 'War', 'e-sports', 'Moddable', 'Strategy', 'Difficult', 'Singleplayer', 'Survival', 'Simulation', 'Atmospheric', 'Sandbox']
item_recommended_5 = ['Action', 'FPS', 'Shooter', 'Multiplayer', 'Singleplayer', 'Tactical', 'First-Person', 'Competitive', 'Team-Based', 'Classic', 'Strategy', 'Military', 'Online Co-Op', 'Adventure', 'Open World', 'Survival', 'Atmospheric', 'Dark', 'Simulation', 'Horror']

# User-Entered Item
user_entered_item = ['Action', 'FPS', 'Multiplayer', 'Shooter', 'Classic', 'Team-Based', 'Competitive', 'First-Person', 'Tactical', "1990's", 'e-sports', 'PvP', 'Military', 'Strategy', 'Score Attack', 'Survival', 'Assassin', '1980s', 'Ninja', 'Tower Defense']

# Convert the feature sets into sets
set_recommended_1 = set(item_recommended_1)
set_recommended_2 = set(item_recommended_2)
set_recommended_3 = set(item_recommended_3)
set_recommended_4 = set(item_recommended_4)
set_recommended_5 = set(item_recommended_5)
set_user_entered_item = set(user_entered_item)

# Calculate intersections between the user's set and the recommended sets
intersection_1 = len(set_recommended_1.intersection(set_user_entered_item))
intersection_2 = len(set_recommended_2.intersection(set_user_entered_item))
intersection_3 = len(set_recommended_3.intersection(set_user_entered_item))
intersection_4 = len(set_recommended_4.intersection(set_user_entered_item))
intersection_5 = len(set_recommended_5.intersection(set_user_entered_item))

# Calculate unions between the user's set and the recommended sets
union_1 = len(set_recommended_1.union(set_user_entered_item))
union_2 = len(set_recommended_2.union(set_user_entered_item))
union_3 = len(set_recommended_3.union(set_user_entered_item))
union_4 = len(set_recommended_4.union(set_user_entered_item))
union_5 = len(set_recommended_5.union(set_user_entered_item))

# Calculate Jaccard Similarity coefficient for each recommended set
jaccard_similarity_1 = intersection_1 / union_1
jaccard_similarity_2 = intersection_2 / union_2
jaccard_similarity_3 = intersection_3 / union_3
jaccard_similarity_4 = intersection_4 / union_4
jaccard_similarity_5 = intersection_5 / union_5

print(f"Jaccard Similarity 1: {jaccard_similarity_1}")
print(f"Jaccard Similarity 2: {jaccard_similarity_2}")
print(f"Jaccard Similarity 3: {jaccard_similarity_3}")
print(f"Jaccard Similarity 4: {jaccard_similarity_4}")
print(f"Jaccard Similarity 5: {jaccard_similarity_5}")

The **Jaccard Similarity coefficients** calculated for each recommended item and the item entered by the user, represent how much similarity or overlap exists between the tags of the items.

- Jaccard Similarity 1: 0.379 means that 37.9% of the tags in recommended item 1 match the tags in the user's item.

- Jaccard Similarity 2: 0.285 means that 28.5% of the tags in recommended set 2 match the tags in the user's item.

- Jaccard Similarity 3: 0.333 means that 33.3% of the tags in recommended set 3 match the tags in the user's item.

- Jaccard Similarity 4: 0.481 means that 48.1% of the tags in recommended set 4 match the tags in the user's item.

- Jaccard Similarity 5: 0.428 means that 42.8% of the tags in recommended set 5 match the tags in the user's item.

In general, these values indicate the degree of similarity between the tags of the recommended itms and the user's items. The higher the Jaccard Similarity value, the greater the overlap of tags, and therefore, the higher the similarity between the items. In this case, the values range from approximately 28% to 48%, suggesting that there is some similarity between the tags but also some differences in each recommended item compared to the user's item. For us, this is good, not too similar, bot also not too different. 

Now, let's calculate the average of these metrics for 10 random items entered by the user and its corresponding recommendations:

In [None]:
# Use the sample function to select 30 random elements from the 'item_id' column
random_selection = df_FML1_for_Model['item_id'].sample(n=50)

In [None]:
import pandas as pd
import numpy as np

# Initialize lists to store Recall, Precision, F1-Score, and Jaccard Similarity scores for each item
recall_scores = []
precision_scores = []
f1_scores = []
jaccard_similarity_scores = []

for user_item_id in random_selection:
    # Apply the Game_RecommendationLSH function and extract recommendations
    recommendations = Game_Recommendation(user_item_id)
    
    # Extract recommendation titles
    titles = [rec[f'Rec {i}'] for i, rec in enumerate(recommendations, start=1)]
    
    # Initialize variables to track common features
    total_common_features_identified = 0
    total_common_features_user = 0
    total_identified_features = 0
    jaccard_similarities = []  # To store Jaccard Similarity values in this iteration
    
    for title in titles:
        # Look up corresponding tags for each title in df_FML1_for_Model
        matching_row = df_FML1_for_Model[df_FML1_for_Model['title'] == title]
        if not matching_row.empty:
            tags = set(matching_row.iloc[0]['tags'].split(', '))
            
            # Calculate actual common features for the user's item
            user_common_features = set(df_FML1_for_Model[df_FML1_for_Model['item_id'] == user_item_id]['tags'].values[0].split(', '))
            
            # Calculate metrics for this recommendation
            num_common_features_identified = len(tags.intersection(user_common_features))
            num_common_features_user = len(user_common_features)
            num_identified_features = len(tags)
            
            # Update total counts
            total_common_features_identified += num_common_features_identified
            total_common_features_user += num_common_features_user
            total_identified_features += num_identified_features
            
            # Calculate Jaccard Similarity for this recommendation
            jaccard_similarity = num_common_features_identified / (num_common_features_user + num_identified_features - num_common_features_identified)
            jaccard_similarities.append(jaccard_similarity)
    
    # Calculate Recall for this item
    recall = total_common_features_identified / total_common_features_user
    
    # Calculate Precision for this item
    precision = total_common_features_identified / total_identified_features
    
    # Calculate F1-Score for this item
    if precision + recall > 0:
        f1_score = 2 * (precision * recall) / (precision + recall)
    else:
        f1_score = 0.0
    
    # Calculate the average Jaccard Similarity for this iteration
    avg_jaccard_similarity = sum(jaccard_similarities) / len(jaccard_similarities)
    
    # Add Recall, Precision, F1-Score, and average Jaccard Similarity to their respective lists
    recall_scores.append(recall)
    precision_scores.append(precision)
    f1_scores.append(f1_score)
    jaccard_similarity_scores.append(avg_jaccard_similarity)  # Store the average for this iteration

# Calculate the average Recall, Precision, F1-Score, and Jaccard Similarity over the 10 items
average_recall = sum(recall_scores) / len(recall_scores)
average_precision = sum(precision_scores) / len(precision_scores)
average_f1_score = sum(f1_scores) / len(f1_scores)
average_jaccard_similarity = sum(jaccard_similarity_scores) / len(jaccard_similarity_scores)

# Calculate the standard deviation of Recall, Precision, F1-Score, and Jaccard Similarity
std_recall = np.std(recall_scores)
std_precision = np.std(precision_scores)
std_f1_score = np.std(f1_scores)
std_jaccard_similarity = np.std(jaccard_similarity_scores)

# Print the results in the desired format, including the standard deviation
print(f"Average Recall: {average_recall:.4f} +/- {std_recall:.4f} ")
print(f"Average Precision: {average_precision:.4f} +/- {std_precision:.4f} ")
print(f"Average F1-Score: {average_f1_score:.4f} +/- {std_f1_score:.4f} ")
print(f"Average Jaccard Similarity: {average_jaccard_similarity:.4f} +/- {std_jaccard_similarity:.4f} ")


The model demonstrates excellent performance based on the metrics observed. Conducting 50 iterations where the recommendation system successfully operates for a specific item_id, provides a robust sample for approximating the model's performance across the entire population of items. 

**Recall:** Recall measures the proportion of relevant items that were recommended by our system. An average recall of 0.5982 suggests that, on average, our system is able to recommend about 59.82% of the relevant items. The +/- 0.2933 represents the standard deviation or uncertainty in this measure across our 50 random item selections. In other words, there's some variability in how well our system performs from one random selection to another.

**Precision:** Precision measures the proportion of recommended items that are actually relevant. An average precision of 0.6197 suggests that, on average, about 61.97% of the recommended items are relevant to the user. The standard deviation (+/- 0.2735) indicates the variability in precision across different random selections.

**F1-Score:** The F1-Score is the harmonic mean of precision and recall and is a balance between the two. An average F1-Score of 0.6025 suggests a reasonable balance between precision and recall in our recommendations. The standard deviation (+/- 0.2862) indicates variability across different random selections.

**Average Jaccard Similarity:** Jaccard Similarity measures the similarity between the set of recommended items and the set of relevant items. An average Jaccard Similarity of 0.5423 suggests that, on average, there is a 54.23% similarity between the recommended items and the relevant items. The standard deviation (+/- 0.3309) indicates the variability in similarity across different random selections.

In summary, our content-based recommendation system seems to have a reasonable level of performance, with a decent balance between precision and recall. However, there is some variability in these metrics across different random selections, which may indicate that the system's performance could be further optimized. These metrics provide insights into how well our system is capturing relevant items and how precise its recommendations are. By the other hand, as we exposed before, there must be a good balance between similarities and differences to avoid showing exactly the same content, but keeping some differences that can cause a pleasant surprise in the user. The metrics above show that balance and that is exactly what we want. Also, it is good to notice that we did not use the cosine-similarity method due to it consumes high quantity of resources, so we decided to use an approximation called LSH (Locality-Sensitive Hashing), that converts our information using hash functions in order to consume less resources but keeping the differences and similarities. For sure there is always room for improvement, but this basic model was optimized to consume the less resources with the most of performance. Finally, the tags showed that they are a good way to describe an item and look for similarities in the recommendation system. 

In [90]:
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay



In [81]:


X = df_F345[['sum_playtime_forever', 'sentiment_analysis']]
Y = df_F345['item_id'] 


X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)

In [82]:
# Print the shape of X_train, X_test, y_train, y_test
print("Size of x_train :",(X_train.shape))
print("Size of y_train :",(y_train.shape))
print("Size of X_test :",(X_test.shape))
print("Size of y_test :",(y_test.shape))

Size of x_train : (31319, 2)
Size of y_train : (31319,)
Size of X_test : (7830, 2)
Size of y_test : (7830,)


In [84]:


lreg = LinearRegression()
lreg.fit(X_train, y_train)
lreg_pred = lreg.predict(X_test)
mse = mean_squared_error(y_test, lreg_pred)
print("Mean Squared Error: {:.2f}".format(mse)) # There were some values related to multiclass and mixed continuous values that y i was not
                                               # able to apply accuracy score then applied mse to deal with this problem


Mean Squared Error: 15532304622.17


In [86]:
from sklearn.metrics import r2_score

# Your existing code
lreg = LinearRegression()
lreg.fit(X_train, y_train)
lreg_pred = lreg.predict(X_test)
r2 = r2_score(y_test, lreg_pred)
print("R2 Score: {:.2f}".format(r2*100))

R2 Score: 8.30


In [87]:
logreg = LogisticRegression() # Creating an instance of logistic regression classifier.
logreg.fit(X_train, y_train) # here, we train the logistic regression model using the training data.
logreg_predict =  logreg.predict(X_test) # after training the model, here, we are using trained model to predict on the test data
logreg_acc = accuracy_score(logreg_predict, y_test) # here, calculating the accuracy of the model’s predictions on the test data.
print("Test accuracy : {:.2f}%".format(logreg_acc*100))

Test accuracy : 8.63%


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [91]:
# Confusion_matrix is a table that summarizes the performance of a classification model on a set of test data.
print(confusion_matrix(y_test, logreg_predict)) 
print("\n")
# Classification_report provides a detailed summary of the model’s performance, including precision, recall, F1-score, and support for each class.
print(classification_report(y_test, logreg_predict))

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


              precision    recall  f1-score   support

        10.0       0.00      0.00      0.00         7
        20.0       0.00      0.00      0.00         2
        30.0       0.00      0.00      0.00         1
        60.0       0.00      0.00      0.00         3
        70.0       0.00      0.00      0.00        14
        80.0       0.00      0.00      0.00         1
       130.0       0.00      0.00      0.00         1
       220.0       0.00      0.00      0.00        42
       240.0       0.00      0.00      0.00        47
       280.0       0.00      0.00      0.00         1
       300.0       0.00      0.00      0.00         7
       320.0       0.00      0.00      0.00         3
       340.0       0.00      0.00      0.00         2
       380.0       0.00      0.00      0.00         5
       400.0       0.00      0.00      0.00        19
       420.0 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
from matplotlib import style
style.use('ggplot')
style.use("classic")

cm = confusion_matrix(y_test, logreg_predict, labels=logreg.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels = logreg.classes_)
disp.plot()
