In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("troop_movements10m.csv")
df.head()

Unnamed: 0,timestamp,unit_id,unit_type,location_x,location_y,destination_x,destination_y,homeworld
0,2023-06-13 17:33:18,1,at-st,2.0,8.0,1,1,Glee Anselm
1,2023-06-13 17:33:17,2,tie_silencer,4.0,4.0,0,1,Trandosha
2,2023-06-13 17:33:16,3,at-at,0.0,3.0,6,1,Corellia
3,2023-06-13 17:33:15,4,tie_silencer,6.0,1.0,6,9,Shili
4,2023-06-13 17:33:14,5,tie_fighter,0.0,4.0,9,6,Muunilinst


In [3]:
# replace invalid_unit unit_type records with unknown
#df['unit_type'].value_counts()
df['unit_type'].replace('invalid_unit', 'unknown', inplace=True)
df['unit_type'].value_counts()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['unit_type'].replace('invalid_unit', 'unknown', inplace=True)


unit_type
x-wing                1428412
stormtrooper          1428291
tie_silencer          1428016
tie_fighter           1426809
at-at                 1426525
at-st                 1426173
resistance_soldier    1425774
unknown                 10000
Name: count, dtype: int64

In [4]:
# ffill method with location_x and location_y if missing
# df.isna().sum()
df.ffill(inplace=True)
df.isna().sum()


timestamp        0
unit_id          0
unit_type        0
location_x       0
location_y       0
destination_x    0
destination_y    0
homeworld        0
dtype: int64

In [5]:
# Convert to numeric data
df = pd.get_dummies(df, columns=['homeworld', 'unit_type'])
print(df.head())


             timestamp  unit_id  location_x  location_y  destination_x  \
0  2023-06-13 17:33:18        1         2.0         8.0              1   
1  2023-06-13 17:33:17        2         4.0         4.0              0   
2  2023-06-13 17:33:16        3         0.0         3.0              6   
3  2023-06-13 17:33:15        4         6.0         1.0              6   
4  2023-06-13 17:33:14        5         0.0         4.0              9   

   destination_y  homeworld_Alderaan  homeworld_Aleen Minor  \
0              1               False                  False   
1              1               False                  False   
2              1               False                  False   
3              9               False                  False   
4              6               False                  False   

   homeworld_Bestine IV  homeworld_Cerea  ...  homeworld_Vulpter  \
0                 False            False  ...              False   
1                 False            False

In [6]:
# save the clean data into a parquet file
! pip install pyarrow
! pip install fastparquet

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 24.0 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Defaulting to user installation because normal site-packages is not writeable
Collecting fastparquet
  Using cached fastparquet-2024.5.0.tar.gz (466 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting cramjam>=2.3 (from fastparquet)
  Using cached cramjam-2.8.3-cp312-none-win_amd64.whl.metadata (4.3 kB)
Collecting fsspec (from fastparquet)
  Using cached fsspec-2024.6.1-py3-none-any.whl.metadata (11 kB)
Using cached cramjam-2.8.3-cp312-none-win_amd64.whl (1.6 MB)
Using cached fsspec-2024.6.1-py3-none-any.whl (177 kB)
Building wheels for collected packages: fastparquet
  Building wheel for fastparquet (pyproject.toml): started
  Building wheel for fastparquet (pyproject.toml): finished with status '

  error: subprocess-exited-with-error
  
  × Building wheel for fastparquet (pyproject.toml) did not run successfully.
  │ exit code: 1
  ╰─> [140 lines of output]
      fatal: not a git repository (or any of the parent directories): .git
      Traceback (most recent call last):
        File "C:\Users\wasadmin\AppData\Local\Temp\2\pip-build-env-e1pbysy7\overlay\Lib\site-packages\setuptools_scm\_integration\pyproject_reading.py", line 36, in read_pyproject
          section = defn.get("tool", {})[tool_name]
                    ~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^
      KeyError: 'setuptools_scm'
      running bdist_wheel
      running build
      running build_py
      creating build
      creating build\lib.win-amd64-cpython-312
      creating build\lib.win-amd64-cpython-312\fastparquet
      copying fastparquet\api.py -> build\lib.win-amd64-cpython-312\fastparquet
      copying fastparquet\compression.py -> build\lib.win-amd64-cpython-312\fastparquet
      copying fastparquet\converted_typ

In [7]:
filename = "troop_movements10m.parquet"

"""with open(filename, 'wb') as file:
    pickle.dump(model, file)"""

df.to_parquet(filename)

## Loading the Model

In [1]:
import pickle
import pandas as pd

with open('trained_model.pkl', 'rb') as file:
    model = pickle.load(file)

df = pd.read_parquet('troop_movements10m.parquet')


In [2]:
test = df.drop(['timestamp', 'location_x', 'location_y', 'destination_x', 'destination_y', 'unit_id'], axis=1)
test.head()

Unnamed: 0,homeworld_Alderaan,homeworld_Aleen Minor,homeworld_Bestine IV,homeworld_Cerea,homeworld_Champala,homeworld_Chandrila,homeworld_Concord Dawn,homeworld_Corellia,homeworld_Dagobah,homeworld_Dathomir,...,homeworld_Vulpter,homeworld_Zolan,unit_type_at-at,unit_type_at-st,unit_type_resistance_soldier,unit_type_stormtrooper,unit_type_tie_fighter,unit_type_tie_silencer,unit_type_unknown,unit_type_x-wing
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
2,False,False,False,False,False,False,False,True,False,False,...,False,False,True,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False


In [4]:
preds = model.predict(test)
preds[:5]

MemoryError: Unable to allocate 1.86 GiB for an array with shape (50, 10000000) and data type float32