# Data Preparation & Storage

## Load the datasets

In [5]:
import pandas as pd

# Load the datasets
df1 = pd.read_excel("globalterrorismdb_0522dist.xlsx")
df2 = pd.read_excel("globalterrorismdb_2021Jan-June_1222dist.xlsx")

print(df2.head())

        eventid  iyear  imonth  iday  approxdate  extended resolution  \
0  202101010004   2021       1     1         NaN         0        NaT   
1  202101010005   2021       1     1  01/01/2021         0        NaT   
2  202101010006   2021       1     1         NaN         0        NaT   
3  202101010009   2021       1     1         NaN         0        NaT   
4  202101010024   2021       1     2         NaN         0        NaT   

   country  country_txt  region  ...  \
0       60        Egypt      10  ...   
1       92        India       6  ...   
2      228        Yemen      10  ...   
3        4  Afghanistan       6  ...   
4      182      Somalia      11  ...   

                                            addnotes  \
0                                                NaN   
1                                                NaN   
2  There is doubt that this incident meets terror...   
3  There is doubt that this incident meets terror...   
4                                       

In [11]:
print(df1.head())

        eventid  iyear  imonth  iday approxdate  extended resolution  country  \
0  197000000001   1970       7     2        NaN         0        NaT       58   
1  197000000002   1970       0     0        NaN         0        NaT      130   
2  197001000001   1970       1     0        NaN         0        NaT      160   
3  197001000002   1970       1     0        NaN         0        NaT       78   
4  197001000003   1970       1     0        NaN         0        NaT      101   

          country_txt  region  ... addnotes scite1 scite2  scite3  dbsource  \
0  Dominican Republic       2  ...      NaN    NaN    NaN     NaN      PGIS   
1              Mexico       1  ...      NaN    NaN    NaN     NaN      PGIS   
2         Philippines       5  ...      NaN    NaN    NaN     NaN      PGIS   
3              Greece       8  ...      NaN    NaN    NaN     NaN      PGIS   
4               Japan       4  ...      NaN    NaN    NaN     NaN      PGIS   

   INT_LOG  INT_IDEO INT_MISC INT_ANY 

## Convert the datasets

In [7]:
# Save as CSV
df1.to_csv("globalterrorismdb_1970_2020.csv", index=False)
df2.to_csv("globalterrorismdb_2021.csv", index=False)

In [13]:
# Convert the 'approxdate' column to a string
df1['approxdate'] = df1['approxdate'].astype(str)
df2['approxdate'] = df2['approxdate'].astype(str)

# Save as Parquet (recommended for Spark)
df1.to_parquet("globalterrorismdb_1970_2020.parquet", index=False)
df2.to_parquet("globalterrorismdb_2021.parquet", index=False)

# Data Storage Using MongoDB

In [15]:
from pymongo import MongoClient
import pandas as pd

# Connect to MongoDB
client = MongoClient("mongodb://localhost:27017/")
db = client["GlobalTerrorism"]
collection = db["GTD"]

# Load CSV
df1 = pd.read_csv("globalterrorismdb_1970_2020.csv")
df2 = pd.read_csv("globalterrorismdb_2021.csv")

# Convert NaN to None for MongoDB compatibility
df1 = df1.where(pd.notna(df1), None)
df2 = df2.where(pd.notna(df2), None)

# Insert into MongoDB
collection.insert_many(df1.to_dict(orient="records"))
collection.insert_many(df2.to_dict(orient="records"))

print("CSV Data inserted successfully!")

  df1 = pd.read_csv("globalterrorismdb_1970_2020.csv")
  df2 = pd.read_csv("globalterrorismdb_2021.csv")


CSV Data inserted successfully!


## Query Data

In [17]:
# Retrieve data
document = collection.find_one()
print(document)

{'_id': ObjectId('67accb675a2fd9c0c2e5e739'), 'eventid': 197000000001, 'iyear': 1970, 'imonth': 7, 'iday': 2, 'approxdate': None, 'extended': 0, 'resolution': None, 'country': 58, 'country_txt': 'Dominican Republic', 'region': 2, 'region_txt': 'Central America & Caribbean', 'provstate': 'National', 'city': 'Santo Domingo', 'latitude': 18.456792, 'longitude': -69.951164, 'specificity': 1.0, 'vicinity': 0, 'location': None, 'summary': None, 'crit1': 1, 'crit2': 1, 'crit3': 1, 'doubtterr': 0, 'alternative': nan, 'alternative_txt': None, 'multiple': 0.0, 'success': 1, 'suicide': 0, 'attacktype1': 1, 'attacktype1_txt': 'Assassination', 'attacktype2': nan, 'attacktype2_txt': None, 'attacktype3': nan, 'attacktype3_txt': None, 'targtype1': 14, 'targtype1_txt': 'Private Citizens & Property', 'targsubtype1': 68.0, 'targsubtype1_txt': 'Named Civilian', 'corp1': None, 'target1': 'Julio Guzman', 'natlty1': 58.0, 'natlty1_txt': 'Dominican Republic', 'targtype2': nan, 'targtype2_txt': None, 'targsubt