# Part 3

In [28]:
import pickle

import numpy as np
import pandas as pd

## Loading data from troop_movements_1m.csv

In [29]:
# Read the raw troop-movement records from the CSV export.
csv_path = "troop_movements_1m.csv"
df = pd.read_csv(csv_path)

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,timestamp,unit_id,unit_type,location_x,location_y,destination_x,destination_y,homeworld
0,2023-06-03 03:19:15,919214,tie_silencer,2.0,5.0,9,3,Aleen Minor
1,2023-02-24 13:50:40,9467154,stormtrooper,9.0,0.0,9,1,Malastare
2,2023-03-29 19:54:55,6585778,tie_silencer,0.0,6.0,5,9,Serenno
3,2023-04-30 00:58:11,3878023,tie_silencer,4.0,2.0,9,9,Tund
4,2023-04-10 22:00:26,5537117,at-st,6.0,8.0,5,8,Skako


## Install pyarrow and fastparquet

In [30]:
! pip install pyarrow
! pip install fastparquet

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [31]:
# Summary statistics for the numeric columns. `count` is the number of
# non-null entries, so a column whose count is below the row total has
# missing values.
df.describe()

Unnamed: 0,unit_id,location_x,location_y,destination_x,destination_y
count,1000000.0,999958.0,999958.0,1000000.0,1000000.0
mean,4999333.0,4.501795,4.498246,4.503209,4.498748
std,2887215.0,2.872456,2.871985,2.870169,2.873131
min,3.0,0.0,0.0,0.0,0.0
25%,2497872.0,2.0,2.0,2.0,2.0
50%,4999172.0,5.0,4.0,5.0,4.0
75%,7501634.0,7.0,7.0,7.0,7.0
max,9999990.0,9.0,9.0,9.0,9.0


## Data Wrangling
Replace the `invalid_unit` values in the `unit_type` column with `unknown`, and fill the missing values in the `location_x` and `location_y` columns using the `ffill` method.

`ffill()` (short for forward fill) fills missing values (NaNs) by carrying forward the last known non-null value.

In [32]:
# Normalise the placeholder label in `unit_type` only. A frame-wide
# `df.replace(...)` would also rewrite any other column that happened to
# contain the same literal string, and it scans every column of the frame.
new_df = df.assign(unit_type=df["unit_type"].replace("invalid_unit", "unknown"))

# Show the rows that now carry the normalised label.
new_df[new_df["unit_type"] == "unknown"]

Unnamed: 0,timestamp,unit_id,unit_type,location_x,location_y,destination_x,destination_y,homeworld
1893,2023-03-24 14:48:56,7038000,unknown,7.0,7.0,7,3,Iktotch
3196,2023-02-27 02:28:11,9248000,unknown,9.0,1.0,4,9,Kashyyyk
4065,2023-06-11 06:54:55,212000,unknown,0.0,6.0,1,5,Troiken
4127,2023-05-29 13:02:53,1318000,unknown,6.0,0.0,5,8,Ryloth
4697,2023-06-10 07:57:42,295000,unknown,3.0,1.0,5,5,Dathomir
...,...,...,...,...,...,...,...,...
992899,2023-03-07 09:55:37,8527000,unknown,5.0,1.0,5,8,Skako
994926,2023-05-13 21:12:45,2677000,unknown,7.0,1.0,4,8,Iridonia
999312,2023-02-19 10:13:43,9914000,unknown,1.0,8.0,4,9,Concord Dawn
999350,2023-05-09 01:34:50,3095000,unknown,0.0,3.0,3,0,Ryloth


In [35]:
# Forward-fill gaps in both location coordinates: each missing value takes
# the last non-null value above it in the same column.
location_cols = ["location_x", "location_y"]
new_df[location_cols] = new_df[location_cols].ffill()

## Saving Data in Parquet format

In [36]:
# Persist the cleaned frame as Parquet.
parquet_path = "troop_movements_1m.parquet"
new_df.to_parquet(parquet_path)

## Loading the Model using pickle

In [37]:
# Deserialize the previously trained model from disk.
# NOTE(security): unpickling can execute arbitrary code, so only load model
# files that come from a trusted source.
with open('trained_model.plk', 'rb') as model_file:
    model = pickle.load(model_file)

## Reading 1m troops data from troop_movements_1m.parquet file

In [38]:
# Reload the cleaned records from the Parquet file.
data = pd.read_parquet(path="troop_movements_1m.parquet")

#### One-hot encoding: use `pd.get_dummies()` to convert the categorical variables into boolean (True/False) indicator columns.

In [39]:
# Keep only the two categorical columns and expand each into one indicator
# column per distinct value.
categorical_cols = ["homeworld", "unit_type"]
test_data = data[categorical_cols]
test_data_encoded = pd.get_dummies(test_data, columns=categorical_cols)

# Preview the encoded feature matrix.
test_data_encoded.head()

Unnamed: 0,homeworld_Alderaan,homeworld_Aleen Minor,homeworld_Bestine IV,homeworld_Cerea,homeworld_Champala,homeworld_Chandrila,homeworld_Concord Dawn,homeworld_Corellia,homeworld_Dagobah,homeworld_Dathomir,...,homeworld_Vulpter,homeworld_Zolan,unit_type_at-at,unit_type_at-st,unit_type_resistance_soldier,unit_type_stormtrooper,unit_type_tie_fighter,unit_type_tie_silencer,unit_type_unknown,unit_type_x-wing
0,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False


In [42]:
# Run the unpickled model on the one-hot encoded features.
# NOTE(review): assumes the encoded columns match, in name and order, the
# features the model was trained on — confirm against the training notebook.
predictions = model.predict(test_data_encoded)
# Peek at the first ten predictions.
predictions[:10]

array([False,  True, False, False, False, False,  True, False, False,
       False])

In [43]:
# Attach the model output to the loaded frame as a new `predictions` column.
# This mutates `data` in place.
data["predictions"] = predictions

In [44]:
# Preview the first five records together with their predictions.
data.head(n=5)

Unnamed: 0,timestamp,unit_id,unit_type,location_x,location_y,destination_x,destination_y,homeworld,predictions
0,2023-06-03 03:19:15,919214,tie_silencer,2.0,5.0,9,3,Aleen Minor,False
1,2023-02-24 13:50:40,9467154,stormtrooper,9.0,0.0,9,1,Malastare,True
2,2023-03-29 19:54:55,6585778,tie_silencer,0.0,6.0,5,9,Serenno,False
3,2023-04-30 00:58:11,3878023,tie_silencer,4.0,2.0,9,9,Tund,False
4,2023-04-10 22:00:26,5537117,at-st,6.0,8.0,5,8,Skako,False
