In [52]:
# Debugging aid: opens a Qt console attached to this notebook's kernel.
# Nothing in the cells below depends on it.
%qtconsole

Our hypothesis is: if the base stats of two Pokemon from different generations are compared, then the Pokemon from the newer generation is statistically likely to have the higher overall base stat total.

In [87]:
%%file pokemon-stat-comparison.py
from mrjob.job import MRJob

## Produce a condensed list of unique pokemon until gen 8 and their stats 
class PokemonStatMapping(MRJob):
    """Condense the raw Pokemon CSV into de-duplicated ID/stat records.

    Input: one comma-separated row per line. Only the first three columns
    (joined into the record ID) and the last six columns (the stats) are read.

    Output: for every distinct row whose first column starts with ``'0'``,
    the key ``"<col0> - <col1> - <col2>"`` and the value
    ``"<s1>,<s2>,<s3>,<s4>,<s5>,<s6>,<total>"`` where ``total`` is the sum of
    the six stats.
    """

    # Number of leading columns joined into the record ID.
    ID_FIELDS = 3
    # Number of trailing columns that hold the stats.
    STAT_FIELDS = 6
    # Counter group under which skipped rows are reported in the job log.
    COUNTER_GROUP = "PokemonStatMapping"

    def mapper(self, _, line):
        """Emit the whole row as the key so identical rows meet in one reducer call.

        Args:
            _: input key, ignored.
            line: one row of the input file.

        Yields:
            ``(line, 1)``; the ``1`` is a placeholder that the reducer ignores.
        """
        # The mapper is called once per input line, so there is nothing left
        # to split on "\n"; the row is forwarded unchanged.
        yield line, 1

    def reducer(self, key, values):
        """Reduce one distinct row to its ID and its stats plus their total.

        Args:
            key: the raw comma-separated row.
            values: placeholder counts from the mapper; not used.

        Yields:
            ``(record_id, stats_csv)`` for rows whose first column starts with
            ``'0'``. Rows with too few columns or with non-integer stats are
            skipped and counted instead of producing a record that the next
            job cannot parse.
        """
        fields = key.split(",")

        # Fewer columns than this means the ID columns and the stat columns
        # would overlap (or not exist at all, e.g. for a blank line).
        if len(fields) < self.ID_FIELDS + self.STAT_FIELDS:
            self.increment_counter(self.COUNTER_GROUP, "rows_with_too_few_columns", 1)
            return

        pokemonID = " - ".join(fields[:self.ID_FIELDS])

        # Keep only rows whose first column starts with '0'; everything else
        # is dropped on purpose, exactly as before.
        if not pokemonID.startswith("0"):
            return

        stat_fields = fields[-self.STAT_FIELDS:]
        try:
            statTotal = sum(int(stat) for stat in stat_fields)
        except ValueError:
            # A non-integer stat used to be silently left out of the total
            # while still being emitted verbatim; report the row instead.
            self.increment_counter(self.COUNTER_GROUP, "rows_with_non_integer_stats", 1)
            return

        # All six stats followed by their total, comma-separated.
        yield pokemonID, ",".join(stat_fields + [str(statTotal)])


# Start the MapReduce job only when this file is executed as a script,
# not when it is imported.
if __name__ == '__main__':
    PokemonStatMapping.run()

Overwriting pokemon-stat-comparison.py


In [88]:
# Run the first MRJob and merge its output into a single intermediate file that the next MRJob reads.
# If a distributed environment were used, this output would be left as multiple part files.
# NOTE: `del` and `copy` are Windows shell commands, so this cell only works on Windows.

# Remove any merged .txt output left over from a previous run.
!del Intermediate_data\*.txt
# Run the job; its results are written as part-* files into Intermediate_data.
!python pokemon-stat-comparison.py ./Pokemon-Datasets/PreProcessed-Datasets/pokemon_swsh_formatted.csv --output-dir=Intermediate_data 
# Concatenate every part file (binary mode) into job1_output.txt.
!copy /b Intermediate_data\part-* Intermediate_data\job1_output.txt
# Delete the part files now that they have been merged.
!del Intermediate_data\part-*

No configs found; falling back on auto-configuration
No configs specified for inline runner
Running step 1 of 1...
Creating temp directory C:\Users\dylan\AppData\Local\Temp\pokemon-stat-comparison.dylan.20241209.185416.280964
job output is in Intermediate_data
Removing temp directory C:\Users\dylan\AppData\Local\Temp\pokemon-stat-comparison.dylan.20241209.185416.280964...


Intermediate_data\part-00000
Intermediate_data\part-00001
Intermediate_data\part-00002
Intermediate_data\part-00003
Intermediate_data\part-00004
Intermediate_data\part-00005
Intermediate_data\part-00006
Intermediate_data\part-00007
Intermediate_data\part-00008
Intermediate_data\part-00009
Intermediate_data\part-00010
Intermediate_data\part-00011
Intermediate_data\part-00012
Intermediate_data\part-00013
Intermediate_data\part-00014
Intermediate_data\part-00015
Intermediate_data\part-00016
Intermediate_data\part-00017
Intermediate_data\part-00018
Intermediate_data\part-00019
Intermediate_data\part-00020
Intermediate_data\part-00021
Intermediate_data\part-00022
Intermediate_data\part-00023
        1 file(s) copied.


In [97]:
%%file pokemon-stat-aggregation.py
from mrjob.job import MRJob

class PokemonStatAggregation(MRJob):
    """Average every stat column per Pokemon generation.

    Input: the first job's output, one record per line, made of a quoted key
    and a quoted value separated by a tab, for example the key
    ``"0001 - Bulbasaur - grass/poison"`` and the value
    ``"45,49,49,65,65,45,318"``.

    Output: for each generation, the list
    ``[avgHp, avgAtt, avgDef, avgSatt, avgSdef, avgSpd, avgStatTotal]``.
    """

    # Highest pokedex number of each generation; position i is generation i + 1.
    GEN_LAST_POKEDEX = (151, 251, 386, 493, 649, 721, 809, 898)
    # Values per record: six stats followed by their total.
    STAT_COUNT = 7
    # Counter group under which skipped lines are reported in the job log.
    COUNTER_GROUP = "PokemonStatAggregation"

    def _generation(self, pokedex):
        """Return the generation for a pokedex number, or None if out of range."""
        if pokedex < 1:
            return None
        for gen, last_pokedex in enumerate(self.GEN_LAST_POKEDEX, start=1):
            if pokedex <= last_pokedex:
                return gen
        return None

    def _skip(self, reason):
        """Record a skipped input line under the given counter name."""
        self.increment_counter(self.COUNTER_GROUP, reason, 1)

    def mapper(self, _, line):
        """Turn one record of the first job into ``(generation, stats)``.

        Args:
            _: input key, ignored.
            line: one tab-separated ``key<TAB>value`` record.

        Yields:
            ``(gen, stats)`` where ``stats`` is the list of integers parsed
            from the value. Records outside generations 1-8 are dropped
            silently; lines that cannot be parsed are skipped and counted
            instead of aborting the job.
        """
        key, tab, value = line.partition("\t")
        if not tab:
            # Blank line or a line without a key/value separator.
            self._skip("lines_without_tab")
            return

        # Remove the surrounding quotation marks from key and value.
        key = key.strip('"')
        value = value.strip('"')

        # The pokedex number is the first " - "-separated part of the key.
        try:
            pokedex = int(key.split(" - ")[0])
        except ValueError:
            self._skip("lines_with_bad_pokedex_number")
            return

        gen = self._generation(pokedex)
        if gen is None:
            return

        # Stats arrive as comma-separated integers, e.g. "45,49,49,65,65,45,318".
        try:
            stats = [int(stat) for stat in value.split(",")]
        except ValueError:
            self._skip("lines_with_non_integer_stats")
            return

        # The reducer reads the first STAT_COUNT entries of every record.
        if len(stats) < self.STAT_COUNT:
            self._skip("lines_with_too_few_stats")
            return

        yield gen, stats

    def reducer(self, key, values):
        """Average each stat column over all records of one generation.

        Args:
            key: the generation number.
            values: iterable of stat lists produced by the mapper.

        Yields:
            ``(key, averages)`` with one float per stat column, in the same
            order as the input columns. Nothing is yielded for an empty
            ``values`` iterable.
        """
        totals = [0] * self.STAT_COUNT
        count = 0

        # Column-wise sums over every record of this generation.
        for stats in values:
            for column in range(self.STAT_COUNT):
                totals[column] += stats[column]
            count += 1

        # Guard the division below; a key normally has at least one value.
        if count == 0:
            return

        yield key, [total / count for total in totals]

# Start the MapReduce job only when this file is executed as a script,
# not when it is imported.
if __name__ == '__main__':
    PokemonStatAggregation.run()

Overwriting pokemon-stat-aggregation.py


In [98]:
# Run the second MRJob on the merged output of the first job.
# No output directory is given, so the per-generation averages are streamed into the cell output.
!python pokemon-stat-aggregation.py ./Intermediate_data/job1_output.txt

1	[64.03932584269663, 73.64044943820225, 68.57865168539325, 67.84831460674157, 66.71910112359551, 70.7752808988764, 411.6011235955056]
2	[70.91428571428571, 68.76190476190476, 70.16190476190476, 65.08571428571429, 72.94285714285714, 61.86666666666667, 409.73333333333335]
3	[65.3913043478261, 72.46376811594203, 68.64492753623189, 67.2536231884058, 66.23188405797102, 62.13768115942029, 402.1231884057971]
4	[79.05185185185185, 85.32592592592593, 83.08888888888889, 82.02962962962962, 82.5111111111111, 77.05925925925926, 489.06666666666666]
5	[72.26712328767124, 84.57534246575342, 73.26027397260275, 72.17808219178082, 68.70547945205479, 67.71232876712328, 438.6986301369863]
6	[71.74603174603175, 79.0, 84.38095238095238, 69.96825396825396, 76.12698412698413, 63.714285714285715, 444.93650793650795]
7	[76.91089108910892, 88.5940594059406, 85.02970297029702, 81.87128712871286, 80.73267326732673, 70.61386138613861, 483.7524752475247]
8	[74.62244897959184, 83.54081632653062, 77.0204081632653, 73.

No configs found; falling back on auto-configuration
No configs specified for inline runner
Creating temp directory C:\Users\dylan\AppData\Local\Temp\pokemon-stat-aggregation.dylan.20241209.190220.079723
Running step 1 of 1...
job output is in C:\Users\dylan\AppData\Local\Temp\pokemon-stat-aggregation.dylan.20241209.190220.079723\output
Streaming final output from C:\Users\dylan\AppData\Local\Temp\pokemon-stat-aggregation.dylan.20241209.190220.079723\output...
Removing temp directory C:\Users\dylan\AppData\Local\Temp\pokemon-stat-aggregation.dylan.20241209.190220.079723...
