In [None]:
import pandas as pd # Used for data manipulation and analysis, especially with DataFrames
import numpy as np  # Used for numerical operations and handling arrays
import matplotlib.pyplot as plt # Used for basic plotting of graphs
import seaborn as sns # Used for creating advanced statistical plots
import plotly.express as px  # Used for creating interactive plots


# Data Preprocessing and Scaling
from sklearn.preprocessing import LabelEncoder,StandardScaler  # Used for encoding categorical data into numerical format
from sklearn.preprocessing import StandardScaler  # Used for scaling numerical features to have a mean of 0 and standard deviation of 1


from scipy.stats import zscore   # Used to calculate the Z-score of a dataset, which helps in identifying outliers




In [None]:

df = pd.read_csv('/content/video_game_sales.csv')

In [None]:
# Returns a tuple representing the dimensions of the DataFrame 'df'.

# Useful for quickly checking the size of the dataset and understanding its structure.

df.shape

(64016, 14)

In [None]:
df.head()

Unnamed: 0,img,title,console,genre,publisher,developer,critic_score,total_sales,na_sales,jp_sales,pal_sales,other_sales,release_date,last_update
0,/games/boxart/full_6510540AmericaFrontccc.jpg,Grand Theft Auto V,PS3,Action,Rockstar Games,Rockstar North,9.4,20.32,6.37,0.99,9.85,3.12,2013-09-17,
1,/games/boxart/full_5563178AmericaFrontccc.jpg,Grand Theft Auto V,PS4,Action,Rockstar Games,Rockstar North,9.7,19.39,6.06,0.6,9.71,3.02,2014-11-18,2018-01-03
2,/games/boxart/827563ccc.jpg,Grand Theft Auto: Vice City,PS2,Action,Rockstar Games,Rockstar North,9.6,16.15,8.41,0.47,5.49,1.78,2002-10-28,
3,/games/boxart/full_9218923AmericaFrontccc.jpg,Grand Theft Auto V,X360,Action,Rockstar Games,Rockstar North,,15.86,9.06,0.06,5.33,1.42,2013-09-17,
4,/games/boxart/full_4990510AmericaFrontccc.jpg,Call of Duty: Black Ops 3,PS4,Shooter,Activision,Treyarch,8.1,15.09,6.18,0.41,6.05,2.44,2015-11-06,2018-01-14


#DATA PRE PROCESSING
->
Data preprocessing is the process of preparing raw data for analysis by transforming it into a clean, consistent, and usable format. It is an essential step in data science and machine learning, as raw data is often incomplete, inconsistent, or noisy, which can negatively impact model performance and analysis quality. Preprocessing ensures that the data is suitable for further analysis, modeling, and visualization.

In [None]:
#deleting unwanted column

df.drop(columns = ['img'], inplace = True)

# Removes the column named 'img' from the DataFrame 'df'.
# 'inplace=True' makes this change directly in the DataFrame without needing to reassign it.
# This is useful for removing unnecessary data that does not contribute to analysis or modeling.

In [None]:
#checking null values

df.isna().sum()

# Checks for missing (null) values in each column of the DataFrame 'df'.
# 'isna()' identifies null values, and 'sum()' aggregates the count of nulls per column.
# This helps in understanding the extent of missing data, which is crucial for data cleaning.

Unnamed: 0,0
title,0
console,0
genre,0
publisher,0
developer,17
critic_score,57338
total_sales,45094
na_sales,51379
jp_sales,57290
pal_sales,51192


In [None]:
df.info()

# Shows the number of non-null values, data types of each column, and memory usage.
# Useful for quickly understanding the structure of the dataset, identifying columns with missing values,
# and checking data types for compatibility with analysis or modeling tasks.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64016 entries, 0 to 64015
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         64016 non-null  object 
 1   console       64016 non-null  object 
 2   genre         64016 non-null  object 
 3   publisher     64016 non-null  object 
 4   developer     63999 non-null  object 
 5   critic_score  6678 non-null   float64
 6   total_sales   18922 non-null  float64
 7   na_sales      12637 non-null  float64
 8   jp_sales      6726 non-null   float64
 9   pal_sales     12824 non-null  float64
 10  other_sales   15128 non-null  float64
 11  release_date  56965 non-null  object 
 12  last_update   17879 non-null  object 
dtypes: float64(6), object(7)
memory usage: 6.3+ MB


In [None]:
# Check the column names to confirm

print("Column names in the dataset:", df.columns)

# Prints the names of all columns in the DataFrame 'df'.
# This is useful for confirming the exact names of each column,
# which is helpful when selecting, renaming, or modifying specific columns in the dataset.

Column names in the dataset: Index(['title', 'console', 'genre', 'publisher', 'developer', 'critic_score',
       'total_sales', 'na_sales', 'jp_sales', 'pal_sales', 'other_sales',
       'release_date', 'last_update'],
      dtype='object')


ENCODING

->Encoding in the context of data preprocessing and machine learning is the process of converting categorical (non-numeric) data into a numerical format so that machine learning algorithms can interpret it.

In [None]:
# Define the columns for encoding based on the actual dataset columns

columns_to_encode = ['platform', 'genre', 'publisher']

# Specifies the list of categorical columns that need to be encoded for modeling.
# Encoding transforms categorical variables into numerical form, making them usable in machine learning algorithms.

label_encoders = {}

# Initializes an empty dictionary to store LabelEncoder objects for each categorical column.
# This allows us to encode multiple columns and keep track of the encoders used,

In [None]:
# Check if each column exists in the dataset before encoding

for column in columns_to_encode:
    if column in df.columns: # Verifies that the specified column is present in the DataFrame
        le = LabelEncoder()   # Creates a new LabelEncoder instance for encoding
        df[column] = le.fit_transform(df[column])   # Encodes the categorical values in the column to numeric values
        label_encoders[column] = le # Stores the LabelEncoder in the dictionary for potential reverse transformations
    else:
        print(f"Column '{column}' not found in the dataset.") # Alerts if a specified column is missing in the DataFrame


        # This loop ensures only existing columns are encoded, avoiding errors if a specified column is absent.
        # Storing encoders allows decoding later, which may be useful in interpreting the encoded data.


Column 'platform' not found in the dataset.


In [None]:
print("Encoding complete. Here’s a sample of the data:\n", df.head())

# Prints a message indicating that the encoding process is complete.
# Then displays the first few rows (by default, 5 rows) of the DataFrame 'df' using the 'head()' method.
# This is useful for quickly inspecting the changes made to the data, ensuring that the categorical columns
# have been correctly encoded into numerical values.

Encoding complete. Here’s a sample of the data:
                          title console  genre  publisher       developer  \
0           Grand Theft Auto V     PS3      0       2445  Rockstar North   
1           Grand Theft Auto V     PS4      0       2445  Rockstar North   
2  Grand Theft Auto: Vice City     PS2      0       2445  Rockstar North   
3           Grand Theft Auto V    X360      0       2445  Rockstar North   
4    Call of Duty: Black Ops 3     PS4     15        101        Treyarch   

   critic_score  total_sales  na_sales  jp_sales  pal_sales  other_sales  \
0           9.4        20.32      6.37      0.99       9.85         3.12   
1           9.7        19.39      6.06      0.60       9.71         3.02   
2           9.6        16.15      8.41      0.47       5.49         1.78   
3           NaN        15.86      9.06      0.06       5.33         1.42   
4           8.1        15.09      6.18      0.41       6.05         2.44   

  release_date last_update  
0   2013

In [None]:
# Print the column names in the DataFrame (df) to see the structure of the dataset
print("Columns in the dataset:", df.columns)


Columns in the dataset: Index(['title', 'console', 'genre', 'publisher', 'developer', 'critic_score',
       'total_sales', 'na_sales', 'jp_sales', 'pal_sales', 'other_sales',
       'release_date', 'last_update'],
      dtype='object')


standard scaler

->The Standard Scaler is a preprocessing technique that transforms data to have a mean of 0 and a standard deviation of 1. It is commonly used to standardize features in machine learning, ensuring that all features contribute equally to the model by eliminating scale differences.

In [None]:
# Verify and scale only existing columns
# Initialize the StandardScaler to scale numerical features
scaler = StandardScaler()


# List of columns to be scaled (only those that exist in the dataset)
scaled_columns = ['na_sales', 'eu_sales', 'jp_sales', 'other_sales', 'global_sales']

In [None]:
# Check which columns are available in the DataFrame and scale only those
existing_columns = [col for col in scaled_columns if col in df.columns and df[col].dtype in ['float64', 'int64']]

In [None]:
# Check if any columns are valid columns to sacle

if existing_columns:

  # Apply the StandardScaler to the existing columns and update the DataFrame with the scaled values
    df[existing_columns] = scaler.fit_transform(df[existing_columns])
else:

  # If no valid columns are found, print a message
    print("No valid columns to scale.")

In [None]:
# Print a message indicating that scaling is complete
# Display the first few rows of the DataFrame to show a sample of the scaled data

print("Scaling complete. Here’s a sample of the data:\n", df.head())

Scaling complete. Here’s a sample of the data:
                          title console  genre  publisher       developer  \
0           Grand Theft Auto V     PS3      0       2445  Rockstar North   
1           Grand Theft Auto V     PS4      0       2445  Rockstar North   
2  Grand Theft Auto: Vice City     PS2      0       2445  Rockstar North   
3           Grand Theft Auto V    X360      0       2445  Rockstar North   
4    Call of Duty: Black Ops 3     PS4     15        101        Treyarch   

   critic_score  total_sales   na_sales  jp_sales  pal_sales  other_sales  \
0           9.4        20.32  12.339666  5.259037       9.85    24.297031   
1           9.7        19.39  11.713109  2.948595       9.71    23.507387   
2           9.6        16.15  16.462819  2.178447       5.49    13.715799   
3           NaN        15.86  17.776569 -0.250480       5.33    10.873079   
4           8.1        15.09  11.955647  1.822994       6.05    18.927450   

  release_date last_update  
0  

In [None]:
# Print the column names in the DataFrame to view the structure of the dataset
print("Columns in the dataset:", df.columns)


Columns in the dataset: Index(['title', 'console', 'genre', 'publisher', 'developer', 'critic_score',
       'total_sales', 'na_sales', 'jp_sales', 'pal_sales', 'other_sales',
       'release_date', 'last_update'],
      dtype='object')


z-score

->The z-score (or standard score) is a statistical measurement that describes how far a data point is from the mean of a dataset in terms of standard deviations. It helps us understand whether a value is above or below the mean and by how much, making it easier to compare data points from different distributions or scales.


In [None]:
# Check if 'global_sales' exists in the dataset before applying z-score
if 'global_sales' in df.columns:


    # Calculate the Z-score for the 'global_sales' column and create a new column for it
    df['global_sales_zscore'] = zscore(df['global_sales'])


    # Filter out rows where the Z-score of 'global_sales' is outside the range of -3 to 3 (outliers)
    df = df[(df['global_sales_zscore'] > -3) & (df['global_sales_zscore'] < 3)]

In [None]:
# Drop the Z-score column after filtering
if 'global_sales' in df.columns:
    df.drop(columns=['global_sales_zscore'], inplace=True)
    print("Outlier handling complete. Here’s a sample of the data:\n", df.head())
else:
    print("Column 'global_sales' not found in the dataset.")


Column 'global_sales' not found in the dataset.


In [None]:
# Check if the 'global_sales' column exists in the DataFrame before dropping the Z-score column
if 'global_sales_zscore' in df.columns:


      # Drop the 'global_sales_zscore' column after outlier filtering is complete
    df.drop(columns=['global_sales_zscore'], inplace=True)


    # Print a message confirming outlier handling is complete and display a sample of the data
    print("Column 'global_sales_zscore' dropped successfully.")
else:


  # If the 'global_sales' column is not found, print a message indicating the issue
    print("Column 'global_sales_zscore' not found in the DataFrame.")


Column 'global_sales_zscore' not found in the DataFrame.


feature engineering
->Feature engineering is the process of creating new features (variables) or modifying existing ones to improve the performance of a machine learning model. It involves transforming raw data into a format that helps a model learn better by capturing relevant patterns and relationships.

In [None]:
# Feature Engineering: Average sales per region
# Creating a list of existing sales columns that are present in the DataFrame
existing_sales_columns = [col for col in ['na_sales', 'eu_sales', 'jp_sales', 'other_sales'] if col in df.columns]

In [None]:
# Check if there are columns to calculate the average
if existing_sales_columns:


  # Calculate the average sales per region across the existing columns
    df['avg_sales_per_region'] = df[existing_sales_columns].mean(axis=1)

    print("Average sales per region calculated successfully.")
else:


  # If none of the specified sales columns are found, print a message
    print("None of the sales columns ('na_sales', 'eu_sales', 'jp_sales', 'other_sales') were found in the DataFrame.")

Average sales per region calculated successfully.


In [None]:
# Final Preprocessed Data
# Display the first few rows of the preprocessed data
print("Preprocessed data:\n",df.head())

Preprocessed data:
                          title console  genre  publisher       developer  \
0           Grand Theft Auto V     PS3      0       2445  Rockstar North   
1           Grand Theft Auto V     PS4      0       2445  Rockstar North   
2  Grand Theft Auto: Vice City     PS2      0       2445  Rockstar North   
3           Grand Theft Auto V    X360      0       2445  Rockstar North   
4    Call of Duty: Black Ops 3     PS4     15        101        Treyarch   

   critic_score  total_sales   na_sales  jp_sales  pal_sales  other_sales  \
0           9.4        20.32  12.339666  5.259037       9.85    24.297031   
1           9.7        19.39  11.713109  2.948595       9.71    23.507387   
2           9.6        16.15  16.462819  2.178447       5.49    13.715799   
3           NaN        15.86  17.776569 -0.250480       5.33    10.873079   
4           8.1        15.09  11.955647  1.822994       6.05    18.927450   

  release_date last_update  avg_sales_per_region  
0   2013-

#DATA WRANGLING
->Data wrangling, also known as data munging, is the process of transforming and cleaning raw data into a usable format for analysis. It involves various techniques to organize, restructure, and prepare data, ensuring that it's suitable for machine learning models or statistical analysis.





In [None]:
# Remove duplicate rows from the DataFrame
df.drop_duplicates(inplace=True)

# 2. Handle missing values
# Fill missing 'Year' values with the median year and 'Publisher' with "Unknown"

In [None]:
# Print the names of all columns in the DataFrame
print(df.columns)


Index(['title', 'console', 'genre', 'publisher', 'developer', 'critic_score',
       'total_sales', 'na_sales', 'jp_sales', 'pal_sales', 'other_sales',
       'release_date', 'last_update', 'avg_sales_per_region'],
      dtype='object')


In [None]:
# Rename the column 'incorrect_name' to 'Year' in the DataFrame
df.rename(columns={'incorrect_name': 'Year'}, inplace=True)


In [None]:
# Handle missing values
# Check if 'Year' column exists and fill missing values
if 'Year' in df.columns:
    df['Year'].fillna(df['Year'].median(), inplace=True)


# Check if 'Publisher' column exists and fill missing values
if 'Publisher' in df.columns:
    df['Publisher'].fillna('Unknown', inplace=True)


In [None]:
# Standardize column names: remove leading/trailing spaces, convert to lowercase, and replace spaces with underscores
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')

In [None]:
# Print the updated column names after standardization
print(df.columns)


Index(['title', 'console', 'genre', 'publisher', 'developer', 'critic_score',
       'total_sales', 'na_sales', 'jp_sales', 'pal_sales', 'other_sales',
       'release_date', 'last_update', 'avg_sales_per_region'],
      dtype='object')


In [None]:
# Rename the column 'incorrect_column_name' to 'eu_sales' in the DataFrame

df.rename(columns={'incorrect_column_name': 'eu_sales'}, inplace=True)


In [None]:
# Check if the columns 'eu_sales' and 'global_sales' exist
if 'eu_sales' in df.columns and 'global_sales' in df.columns:


  # Group the data by 'genre' and calculate the sum of sales for each region and 'global_sales'
    genre_sales = df.groupby('genre')[['na_sales', 'eu_sales', 'jp_sales', 'other_sales', 'global_sales']].sum()


    # Merge the calculated genre sales with the original DataFrame on the 'genre' column
    df = df.merge(genre_sales, on='genre', suffixes=('', '_total'))
else:


  # If the required columns are missing, print a message
    print("The required columns are missing from the DataFrame.")


The required columns are missing from the DataFrame.


In [None]:
# Print the first few rows of the wrangled dataset to check the changes
print("Wrangled data:\n",df.head())

Wrangled data:
                          title console  genre  publisher       developer  \
0           Grand Theft Auto V     PS3      0       2445  Rockstar North   
1           Grand Theft Auto V     PS4      0       2445  Rockstar North   
2  Grand Theft Auto: Vice City     PS2      0       2445  Rockstar North   
3           Grand Theft Auto V    X360      0       2445  Rockstar North   
4    Call of Duty: Black Ops 3     PS4     15        101        Treyarch   

   critic_score  total_sales   na_sales  jp_sales  pal_sales  other_sales  \
0           9.4        20.32  12.339666  5.259037       9.85    24.297031   
1           9.7        19.39  11.713109  2.948595       9.71    23.507387   
2           9.6        16.15  16.462819  2.178447       5.49    13.715799   
3           NaN        15.86  17.776569 -0.250480       5.33    10.873079   
4           8.1        15.09  11.955647  1.822994       6.05    18.927450   

  release_date last_update  avg_sales_per_region  
0   2013-09-1

#EDA (EXPLORATORY DATA ANALYSIS)
->Exploratory Data Analysis (EDA) is the process of analyzing and visualizing a dataset to uncover its underlying structure, patterns, relationships, and key characteristics. The goal of EDA is to gain insights that will guide further data preprocessing, feature selection, and modeling steps. EDA is essential because it helps to identify trends, detect outliers, and assess assumptions about the data, ultimately aiding in better decision-making and preparation for predictive modeling.

#1.WHICH TITLES SOLD THE WORLDWIDE?

In [None]:
# Group the data by 'title' and calculate the sum of 'total_sales' for each title, then reset the index
sales_by_title = df.groupby('title')['total_sales'].sum().reset_index()

In [None]:
# Sort the 'sales_by_title' DataFrame by 'total_sales' in descending order
sales_by_title_sort = sales_by_title.sort_values(by = 'total_sales',ascending=False)

In [None]:
# Create a bar chart displaying the top 10 titles by worldwide sales
px.bar(sales_by_title_sort.head(10), x = 'title', y = 'total_sales',
      title = 'Top 10 titles by Worldwide sales')

#2.WHICH YEAR HAD THE HIGHEST SALES?HAS THE INDUSTRY GROWN OVER TIME?

The titles 'Grand theft auto 5','call of Duty Black ops' and 'call of Duty : Modern Warfare' had  the highest sales worldwide

In [None]:
# Extract the year from the 'release_date' column and create a new column 'release_year'
df['release_year'] = pd.to_datetime(df['release_date']).dt.year

In [None]:
# Display the first few rows of the DataFrame to check the data
df.head()

Unnamed: 0,title,console,genre,publisher,developer,critic_score,total_sales,na_sales,jp_sales,pal_sales,other_sales,release_date,last_update,avg_sales_per_region,release_year
0,Grand Theft Auto V,PS3,0,2445,Rockstar North,9.4,20.32,12.339666,5.259037,9.85,24.297031,2013-09-17,,13.965245,2013.0
1,Grand Theft Auto V,PS4,0,2445,Rockstar North,9.7,19.39,11.713109,2.948595,9.71,23.507387,2014-11-18,2018-01-03,12.72303,2014.0
2,Grand Theft Auto: Vice City,PS2,0,2445,Rockstar North,9.6,16.15,16.462819,2.178447,5.49,13.715799,2002-10-28,,10.785688,2002.0
3,Grand Theft Auto V,X360,0,2445,Rockstar North,,15.86,17.776569,-0.25048,5.33,10.873079,2013-09-17,,9.466389,2013.0
4,Call of Duty: Black Ops 3,PS4,15,101,Treyarch,8.1,15.09,11.955647,1.822994,6.05,18.92745,2015-11-06,2018-01-14,10.902031,2015.0


In [None]:
# Group the data by 'release_year' and calculate the sum of 'total_sales' for each year
sales_by_year = df.groupby('release_year')['total_sales'].sum().reset_index()


# Sort the 'sales_by_year' DataFrame by 'total_sales' in descending order
sales_by_year_sort = sales_by_year.sort_values(by = 'total_sales', ascending=False)

In [None]:
# Now you can create the bar plot
# Create a bar chart displaying the top 10 years by worldwide sales
px.bar(sales_by_year_sort.head(10), x='release_year', y='total_sales', title='Top 10 Years by Worldwide Sales')

Conclusion : The year 2008 has made the hear 2009 highest sales worldwide followed by the year 2009

In [None]:
# Create a line chart displaying the global sales trend over the years
px.line(sales_by_year, x = 'release_year', y = 'total_sales', title = 'Global Sales Trend over years')

Conclusion : The Industry grown around 2008 but currently the progress is constant

#3.DO Any Consoles Seem to Specialize in a Particular Genre?

In [None]:
# Group the data by 'console' and 'genre' and calculate the sum of 'total_sales' for each combination
console_genre_sales = df.groupby(['console', 'genre'])['total_sales'].sum().reset_index()

# Sort the 'console_genre_sales' DataFrame by 'total_sales' in descending order

console_genre_sales_sort = console_genre_sales.sort_values(by = 'total_sales', ascending = False)

In [None]:
# Create a sunburst chart to visualize console specialization in different genres
px.sunburst(console_genre_sales_sort, path = ['console', 'genre'], values = 'total_sales',
            title = 'Console specialization in genre')

conclusion : PC console do specialize in genre 'Adventure', 'Strategy' and 'Misc'

#4.What titles are popular in one region but flop in another?

In [None]:
# Display the first few rows of the DataFrame to inspect the data
df.head()

Unnamed: 0,title,console,genre,publisher,developer,critic_score,total_sales,na_sales,jp_sales,pal_sales,other_sales,release_date,last_update,avg_sales_per_region,release_year
0,Grand Theft Auto V,PS3,0,2445,Rockstar North,9.4,20.32,12.339666,5.259037,9.85,24.297031,2013-09-17,,13.965245,2013.0
1,Grand Theft Auto V,PS4,0,2445,Rockstar North,9.7,19.39,11.713109,2.948595,9.71,23.507387,2014-11-18,2018-01-03,12.72303,2014.0
2,Grand Theft Auto: Vice City,PS2,0,2445,Rockstar North,9.6,16.15,16.462819,2.178447,5.49,13.715799,2002-10-28,,10.785688,2002.0
3,Grand Theft Auto V,X360,0,2445,Rockstar North,,15.86,17.776569,-0.25048,5.33,10.873079,2013-09-17,,9.466389,2013.0
4,Call of Duty: Black Ops 3,PS4,15,101,Treyarch,8.1,15.09,11.955647,1.822994,6.05,18.92745,2015-11-06,2018-01-14,10.902031,2015.0




1.   na - North America
2.   jp - Japan
3.   PAL - Phase Alternating Line
     (includes the regions like Europe, Austrailia, New Zealand and some other countries)







In [None]:
# Calculate the ratio of NA sales to total sales and create a new column 'na_ratio'
df['na_ratio'] = df['na_sales'] / df['total_sales']


# Calculate the ratio of JP sales to total sales and create a new column 'jp_ratio'
df['jp_ratio'] = df['jp_sales'] / df['total_sales']


# Calculate the ratio of PAL sales to total sales and create a new column 'pal_ratio'
df['pal_ratio'] = df['pal_sales'] / df['total_sales']

In [None]:
# Display the first three rows of the DataFrame to inspect the data
df.head(3)

Unnamed: 0,title,console,genre,publisher,developer,critic_score,total_sales,na_sales,jp_sales,pal_sales,other_sales,release_date,last_update,avg_sales_per_region,release_year,na_ratio,jp_ratio,pal_ratio
0,Grand Theft Auto V,PS3,0,2445,Rockstar North,9.4,20.32,12.339666,5.259037,9.85,24.297031,2013-09-17,,13.965245,2013.0,0.607267,0.258811,0.484744
1,Grand Theft Auto V,PS4,0,2445,Rockstar North,9.7,19.39,11.713109,2.948595,9.71,23.507387,2014-11-18,2018-01-03,12.72303,2014.0,0.60408,0.152068,0.500774
2,Grand Theft Auto: Vice City,PS2,0,2445,Rockstar North,9.6,16.15,16.462819,2.178447,5.49,13.715799,2002-10-28,,10.785688,2002.0,1.01937,0.134888,0.339938


Titles That are Popular in NA but flop in JP and Pal regions

In [None]:

#Filter the data to find games that have more than 80% of sales in North America (NA)
# and less than 20% of sales in Japan (JP) and the PAL region
na_popular = df[(df.na_ratio > 0.8) & (df.jp_ratio < 0.2) & (df.pal_ratio < 0.2)]

In [None]:
# Create a bar chart displaying the top 5 most popular titles in NA, but with low sales in JP and PAL regions
px.bar(na_popular.head(5), x = 'title', y = [ 'na_sales', 'jp_sales', 'pal_sales'],
       title = '5 most Popular titles in NA but flop in JP and PAL')

conclusion : the titles 'Madden NFL 2204' 'Madden NFL 06' and 'Madden NFL 2005' are popular in NA BUT FLOP JP and PAL

TITLES THAT ARE POPULAR IN JP BUT FLOP IN NA AND PAL REGIONS

In [None]:
# Filter the data to find games that have more than 80% of sales in Japan (JP)
# and less than 20% of sales in North America (NA) and the PAL region
jp_popular = df[(df.jp_ratio > 0.8) & (df.na_ratio < 0.2) & (df.pal_ratio < 0.2)]

In [None]:
# Create a bar chart displaying the top 5 most popular titles in Japan, but with low sales in NA and PAL regions

px.bar(jp_popular.head(5), x = 'title', y = [ 'na_sales', 'jp_sales', 'pal_sales'],
       title = '5 most Popular titles in JP but flop in NA and PAL')

Conclusion : the title 'Hot Shots Golf','rBI Bseball'and 'Famista 89' are popular in JP but flop in Na and PAN

TITLES THAT ARE POPULAR IN PAL BUT FLOP IN NA AND JP REGIONS

In [None]:
# Filter the data to find games that have more than 80% of sales in the PAL region
# and less than 20% of sales in North America (NA) and Japan (JP)
pal_popular = df[(df.pal_ratio > 0.8) & (df.na_ratio < 0.2) & (df.jp_ratio < 0.2)]

In [None]:
# Create a bar chart displaying the top 5 most popular titles in the PAL region, but with low sales in NA and JP regions
px.bar(pal_popular.head(5), x = 'title', y = [ 'na_sales', 'jp_sales', 'pal_sales'],
       title = '5 most Popular titles in PAL but flop in NA and JP')

CONCLUSION : THE TITLE 'THE 'THE SIMS 3', ' COLIN MCRAE RALLY' AND 'ANNO 2070' ARE VERY POPULAR IN PAL REGIONS BUT FLOP IN NA AND JP