In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Introduction
The Rubik's Cube is a 3-D combination puzzle invented in 1974 by Hungarian sculptor and professor of architecture Ernő Rubik. Originally called the Magic Cube, the puzzle was licensed by Rubik to be sold by Ideal Toy Corp. in 1980 via businessman Tibor Laczi and Seven Towns founder Tom Kremer.

I personally started solving the Rubik's cube when I was a high school student. Solving a Rubik's cube has always been very satisfying. I also love to watch people solving Rubik's cubes in competitions. The way they solve the cube within seconds is just amazing!

<div style="width:100%;text-align: center;"> <img align=middle src="https://www.picgifs.com/graphics/r/rubiks-cube/animaatjes-rubiks-cube-7217468.gif" alt="Animated Rubik's cube" style="height:300px;margin-top:3rem;"> </div>

In this notebook, we will mainly analyse the **Best Singles** and the **Best Averages**. If needed, we will also analyse the **Competitions**.

# 1. Best Singles
In this part we will do the necessary analysis for the best singles.

## Data Preparation:
We will prepare our data for analysis and visualization in this section.

In [None]:
df_sing = pd.read_csv('/kaggle/input/evolution-of-rubiks-cube-solve-times/best_singles.csv')

In [None]:
df_sing.head(6)

In [None]:
df_sing.shape

We see that there are a total of 1000 rows.

In [None]:
df_sing.describe()

In [None]:
df_sing.isna().sum()

Seems like no missing values!

We will see the unique countries in the dataset.

In [None]:
print(df_sing['country'].unique())
print(df_sing['country'].nunique())

The 1000 entries come from a total of 65 unique countries.

Here, we see that the same competition can appear several times with different years. So we add another field that holds the competition name without the year. We name this field `unique competition`.

In [None]:
# Keep only the middle of each competition label: drop its first 2 characters
# and its last 6 characters (presumably the trailing year -- verify against the data).
unique_comp = [label[2:len(label) - 6] for label in df_sing['competition']]

df_sing['unique competition'] = unique_comp

Moreover, we will also make a `year` field from the `competition` field so that we know the year in which each competition took place.

In [None]:
# Use the last five characters of each competition label as its year value.
year = [label[-5:] for label in df_sing['competition']]

df_sing['year'] = year

After adding both the fields, we get a new `df_sing` dataframe.

In [None]:
df_sing.head(10)

Now, we will find the unique values of the following fields:
- `year`
- `unique competition`
- `time`
- `rank`

In [None]:
print(df_sing['year'].unique())
print(df_sing['year'].nunique())

In [None]:
print(df_sing['unique competition'].unique())
print(df_sing['unique competition'].nunique())

In [None]:
print(df_sing['time'].unique())
print(df_sing['time'].nunique())

In [None]:
print(df_sing['rank'].unique())
print(df_sing['rank'].nunique())

In order to easily visualize the data from the dataset, we will create a field called `points`, where each value will be one. We will get the benefit of this column in the next cells.

In [None]:
# Give every row a weight of 1. Assigning a scalar broadcasts it to however
# many rows the frame has, so the cell no longer depends on the file having
# exactly 1000 rows.
df_sing['points'] = 1

In [None]:
df_sing.head()

Now our `df_sing` dataframe looks quite good. We will now create three separate dataframes. These are:
- `df_sing_country`
- `df_sing_competition`
- `df_sing_year`

Each of the three dataframes has one field in common: `Points`. In the first (left-hand) field we put all the unique values, and in the `Points` field we put the number of rows of the main dataframe that have that value.

In [None]:
# Count the leaderboard rows per country in a single grouped pass.
# sort=False keeps the countries in order of first appearance in `df_sing`.
df_sing_country = (
    df_sing.groupby('country', sort=False)
    .size()
    .reset_index(name='Points')
    .rename(columns={'country': 'Country'})
)
df_sing_country

In [None]:
# Count the leaderboard rows per competition name in a single grouped pass.
# sort=False keeps the competitions in order of first appearance in `df_sing`.
df_sing_competition = (
    df_sing.groupby('unique competition', sort=False)
    .size()
    .reset_index(name='Points')
    .rename(columns={'unique competition': 'Competition'})
)
df_sing_competition

In [None]:
# Count the leaderboard rows per year in a single grouped pass.
# sort=False keeps the years in order of first appearance in `df_sing`.
# The result is built directly, so the `year` list from the earlier cell is
# no longer overwritten with a list of [year, count] pairs.
df_sing_year = (
    df_sing.groupby('year', sort=False)
    .size()
    .reset_index(name='Points')
    .rename(columns={'year': 'Year'})
)
df_sing_year

We think we are ready for our analysis and visualization!

## Data Analysis & Visualization
We will do our required analysis and visualizations here.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from plotly.offline import plot, iplot, init_notebook_mode
init_notebook_mode(connected=True)

### Pie Chart to see the Percentage of Countries in the Leaderboard
To see which countries are present in what percentage in the leaderboard, we plot a pie chart using the `df_sing_country` dataframe.

In [None]:
# Group every country with fewer than 10 entries under one shared label so the
# pie chart only names the large countries. The relabelling is done on a copy,
# leaving `df_sing_country` with the real country names for later cells.
sing_country_pie = df_sing_country.copy()
sing_country_pie.loc[sing_country_pie['Points'] < 10, 'Country'] = 'Other countries'

# Slices that share a name are summed by plotly, so all the small countries
# end up in a single "Other countries" sector.
fig = px.pie(sing_country_pie, values='Points', names='Country', title='Country Percentages')
fig.show()

### Pie Chart to see the Yearly Percentages in the Leaderboard
To see which year has the most ranks, we plot a pie chart using the `df_sing_year` dataframe.

In [None]:
fig = px.pie(df_sing_year, values='Points', names='Year', title='Yearly Percentages')
fig.show()

### Rank Vs. Time Graph
With this graph we will see how the solving time changes as the rank increases. We have used the `df_sing` dataframe here.

In [None]:
# Line plot of solve time against leaderboard rank for the best singles.
fig, ax = plt.subplots(figsize=(20, 8))
ax.plot(df_sing['rank'], df_sing['time'], color='Red')

# Allow up to 50 intervals on the x axis so the rank ticks are dense.
ax.xaxis.set_major_locator(plt.MaxNLocator(50))

ax.set_xlabel('Rank', fontsize='15')
ax.set_ylabel('Time', fontsize='15')
ax.set_title('Time Vs. Rank Graph', fontsize='20')
ax.grid()
plt.show()

### Top five: Country, Competition, Year
We will rank the country, competition and year from the three dataframes that we created.

**NOTE:** We have not made a visualization of `df_sing_competition`, as the point differences between competitions are quite small.

In [None]:
# Five countries with the most leaderboard entries.
df_sing_country.sort_values(by='Points', ascending=False).head(5)

In [None]:
# Six competitions with the most leaderboard entries.
df_sing_competition.sort_values(by='Points', ascending=False).head(6)

In [None]:
# Five years with the most leaderboard entries.
df_sing_year.sort_values(by='Points', ascending=False).head(5)

## Summary and Conclusion: Best Singles
From the above analysis we can come to the following **key conclusions**:
1. Solving time changes rapidly between Rank-1 and Rank-25 (3.5 seconds to <5 seconds). The graph is almost a straight line in that range.
2. We see a rise like $\sqrt{x}$ after Rank-25.
3. 24.2% of the top 1000 cubers are from the USA, then comes China (11.5%), and after that Poland (4.8%).
4. The golden year for the cubers was 2019, as it holds 41.4% of all the ranks. Then come 2018 (19.3%) and 2021 (11.1%).
5. Although the Republic of Korea holds the 6th position with 3.2% of the rankers, the Korean Championship holds the 2nd position among the competitions worldwide, after CubingUSA Nationals.

# 2. Best Averages
In this part we will do the necessary analysis for the best averages.

## Data Preparation:
We will prepare our data for analysis and visualization in this section.

In [None]:
df_avg = pd.read_csv('/kaggle/input/evolution-of-rubiks-cube-solve-times/best_averages.csv')

In [None]:
df_avg.head(10)

In [None]:
df_avg.shape

We see that there are also a total of 1000 rows.

In [None]:
df_avg.describe()

In [None]:
df_avg.isna().sum()

No missing values!

In the following steps, we will do the same operations as we did in the best singles.

In [None]:
print(df_avg['country'].unique())
print(df_avg['country'].nunique())

The 1000 entries come from a total of 64 unique countries, one fewer than in the best singles.

In [None]:
# Keep only the middle of each competition label: drop its first 2 characters
# and its last 6 characters (presumably the trailing year -- verify against the data).
unique_comp = [label[2:len(label) - 6] for label in df_avg['competition']]

df_avg['unique competition'] = unique_comp

In [None]:
# Use the last five characters of each competition label as its year value.
year = [label[-5:] for label in df_avg['competition']]

df_avg['year'] = year

In [None]:
df_avg.head(10)

In [None]:
print(df_avg['year'].unique())
print(df_avg['year'].nunique())

In [None]:
print(df_avg['unique competition'].unique())
print(df_avg['unique competition'].nunique())

In [None]:
print(df_avg['rank'].unique())
print(df_avg['rank'].nunique())

In [None]:
print(df_avg['time'].unique())
print(df_avg['time'].nunique())

In [None]:
# Give every row a weight of 1. Assigning a scalar broadcasts it to however
# many rows the frame has, so the cell no longer depends on the file having
# exactly 1000 rows.
df_avg['points'] = 1

In [None]:
df_avg.head()

In [None]:
# Count the leaderboard rows per country in a single grouped pass.
# sort=False keeps the countries in order of first appearance in `df_avg`.
df_avg_country = (
    df_avg.groupby('country', sort=False)
    .size()
    .reset_index(name='Points')
    .rename(columns={'country': 'Country'})
)
df_avg_country

In [None]:
# Count the leaderboard rows per competition name in a single grouped pass.
# sort=False keeps the competitions in order of first appearance in `df_avg`.
df_avg_competition = (
    df_avg.groupby('unique competition', sort=False)
    .size()
    .reset_index(name='Points')
    .rename(columns={'unique competition': 'Competition'})
)
df_avg_competition

In [None]:
# Count the leaderboard rows per year in a single grouped pass.
# sort=False keeps the years in order of first appearance in `df_avg`.
df_avg_year = (
    df_avg.groupby('year', sort=False)
    .size()
    .reset_index(name='Points')
    .rename(columns={'year': 'Year'})
)
df_avg_year

The above steps are the same as the ones we used to prepare and clean the dataset for the best singles.

Let's do some visualization!

## Data Analysis & Visualization

We will do our required analysis and visualizations here.

### Rank Vs. Time Graph

With this graph we will see how the solving time changes as the rank increases. We have used the `df_avg` dataframe here.

In [None]:
# Line plot of solve time against leaderboard rank for the best averages.
fig, ax = plt.subplots(figsize=(20, 8))
ax.plot(df_avg['rank'], df_avg['time'], color='orange')

# Allow up to 50 intervals on the x axis so the rank ticks are dense.
ax.xaxis.set_major_locator(plt.MaxNLocator(50))

ax.set_xlabel('Rank', fontsize='15')
ax.set_ylabel('Time', fontsize='15')
ax.set_title('Time Vs. Rank Graph', fontsize='20')
ax.grid()
plt.show()

### Pie Chart to see the Percentage of Countries in the Leaderboard

To see which countries are present in what percentage in the leaderboard, we plot a pie chart using the `df_avg_country` dataframe.


In [None]:
# Group every country with fewer than 10 entries under one shared label so the
# pie chart only names the large countries. The relabelling is done on a copy,
# leaving `df_avg_country` with the real country names for later cells.
avg_country_pie = df_avg_country.copy()
avg_country_pie.loc[avg_country_pie['Points'] < 10, 'Country'] = 'Other countries'

# Slices that share a name are summed by plotly, so all the small countries
# end up in a single "Other countries" sector.
fig = px.pie(avg_country_pie, values='Points', names='Country', title='Country Percentages')
fig.show()

### Pie Chart to see the Yearly Percentages in the Leaderboard

To see which year has the most ranks, we plot a pie chart using the `df_avg_year` dataframe.


In [None]:
fig = px.pie(df_avg_year, values='Points', names='Year', title='Yearly Percentages')
fig.show()

### Top five: Country, Competition, Year

We will rank the country, competition and year from the three dataframes that we created.

**NOTE:** We have not made a visualization of `df_avg_competition`, as the point differences between competitions are quite small.


In [None]:
df_avg_country.sort_values(by=['Points'], ascending=False).head(5)

In [None]:
df_avg_competition.sort_values(by=['Points'], ascending=False).head(26)

In [None]:
df_avg_year.sort_values(by=['Points'], ascending=False).head(5)

## Summary and Conclusion: Best Averages
From the above analysis we can come to the following **key conclusions**:
1. Solving time changes at the fastest pace between Rank-1 and Rank-25 (5.5 seconds to <6.5 seconds). The graph is almost a straight line in that range.
2. Throughout the graph there is a rise like that of $\sqrt{x}$.
3. 23% of the top 1000 cubers (avg.) are from the USA, then comes China (12%), and after that Poland (4.1%).
4. The golden year for the cubers for best averages was 2019, as it holds 41.7% of all the ranks. Then come 2021 (18.6%) and 2020 (14.8%).
5. The top competitions in this part were held mostly in China (top), then Korea, and then the USA (see the sorted values of `df_avg_competition`).

# 3. All Competitions
Here we will do the necessary analysis for all the competition datasets.

## Data Preparation & Cleaning
Here we have cleaned our data and modified it for our use.

In [None]:
# Load the all-competitions table. The absolute Kaggle input path matches the
# one used for the other two CSVs, so the read does not depend on the current
# working directory.
df_comp = pd.read_csv('/kaggle/input/evolution-of-rubiks-cube-solve-times/all_comp.csv')

In [None]:
df_comp.head(10)

We drop the `Unnamed: 0` column.

In [None]:
# Remove the `Unnamed: 0` column. errors='ignore' makes the cell safe to
# re-run: once the column is gone, running it again is a no-op instead of
# raising KeyError.
df_comp = df_comp.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
df_comp.head(10)

In [None]:
df_comp.tail()

In [None]:
df_comp.shape

In [None]:
# Give every competition a weight of 1. Assigning a scalar broadcasts it to
# however many rows the frame has, so the cell no longer depends on the file
# having exactly 7094 rows.
df_comp['points'] = 1

In [None]:
df_comp.head(10)

In [None]:
# Use the last four characters of each date string as the competition's year.
comp_year = [date_text[-4:] for date_text in df_comp['date']]

df_comp['year'] = comp_year

In [None]:
df_comp.head()

In [None]:
print(df_comp['year'].unique())
print(df_comp['year'].nunique())

## Data Analysis and Visualization
There's nothing much to do here for now. I'll make more analysis upon your feedback.

### Pie Chart to see the Yearly percentage of Competition
This pie chart shows how many competitions were held in each year, as a percentage of all competitions.

In [None]:
fig = px.pie(df_comp, values='points', names='year', title='Yearly Percentage of Competition')
fig.show()

## Summary & Conclusion: All Competitions
Here we see that the highest number of competitions was held in 2019 (18.7%), then come 2018 (16.3%), 2017 (13%) and so on. 2021 places 12th with 1.69%. This is because of the Covid-19 pandemic.

Thank you so much, everyone, if you have read this far! For now, this is all I have analysed. I tried my best to keep the notebook as descriptive as possible. If I get the chance in the future, I will certainly work on this further.

<div style="width:100%;text-align: center;"> <img align=middle src="https://images-wixmp-ed30a86b8c4ca887773594c2.wixmp.com/f/98dc23a3-cbd4-4aee-b0b8-30bcb510541b/d60l0qa-8b81ff18-30c3-47cd-b67c-9f9a2e011231.gif?token=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJzdWIiOiJ1cm46YXBwOjdlMGQxODg5ODIyNjQzNzNhNWYwZDQxNWVhMGQyNmUwIiwiaXNzIjoidXJuOmFwcDo3ZTBkMTg4OTgyMjY0MzczYTVmMGQ0MTVlYTBkMjZlMCIsIm9iaiI6W1t7InBhdGgiOiJcL2ZcLzk4ZGMyM2EzLWNiZDQtNGFlZS1iMGI4LTMwYmNiNTEwNTQxYlwvZDYwbDBxYS04YjgxZmYxOC0zMGMzLTQ3Y2QtYjY3Yy05ZjlhMmUwMTEyMzEuZ2lmIn1dXSwiYXVkIjpbInVybjpzZXJ2aWNlOmZpbGUuZG93bmxvYWQiXX0.h5eUT6l0LMkVIM8iL_9LFqbwvkuQrtqHlKKlPRz84H4" alt="Heat beating" style="height:300px;margin-top:3rem;"> </div>