In [96]:
import polars as pl
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler

In [97]:
df = pl.read_csv('../rtf/joined_data.csv')
print(df.shape)

(60134, 18)


In [98]:


# select arms deals where UK is the buyer
uk_buyer_df = df.filter(pl.col("Buyer") == "United Kingdom")
total_uk_buyer_deals = len(uk_buyer_df)
print("Total UK arms deals as buyer:", total_uk_buyer_deals)

# select arms deals where UK is the seller
uk_seller_df = df.filter(pl.col("Seller") == "United Kingdom")
total_uk_seller_deals = len(uk_seller_df)
print("Total UK arms deals as seller:", total_uk_seller_deals)

Total UK arms deals as buyer: 824
Total UK arms deals as seller: 4215


In [99]:
# read conflicts data

df_conflict = pl.read_csv('../conflicts/ucdp-prio-acd-221.csv', dtypes={
    'side_a_id': pl.Utf8,
    'region': pl.Utf8,
    'gwno_a': pl.Utf8,
})
print(df_conflict.shape)

(2568, 28)



The argument `dtypes` for `read_csv` is deprecated. It has been renamed to `schema_overrides`.



In [100]:
# count occurrences of each value in "side_a" and "side_b" columns
side_a_counts = df_conflict['side_a'].value_counts()
side_b_counts = df_conflict['side_b'].value_counts()

# combine the counts and sort in descending order
# top_participants = (side_a_counts.add(side_b_counts, fill_value=0)
#                           .sort_values(ascending=False))
side_a_counts = side_a_counts.rename({'side_a': 'name'})
side_b_counts = side_b_counts.rename({'side_b': 'name'})

top_participants = pl.concat([side_a_counts, side_b_counts])
top_participants = top_participants.group_by('name').sum()

print(top_participants)

# select the top 10 participants
top_10_participants = top_participants.sort("count", descending=True).head(10)


shape: (724, 2)
┌─────────────────────────────────┬───────┐
│ name                            ┆ count │
│ ---                             ┆ ---   │
│ str                             ┆ u32   │
╞═════════════════════════════════╪═══════╡
│ Résistance Armée Tunisienne     ┆ 1     │
│ Darfur Joint Resistance Forces… ┆ 1     │
│ SSIA                            ┆ 2     │
│ PF, ZANU                        ┆ 1     │
│ Government of Dominican Republ… ┆ 1     │
│ …                               ┆ …     │
│ Government of Uruguay           ┆ 1     │
│ Taleban                         ┆ 10    │
│ NLC                             ┆ 1     │
│ Republic of Armenia             ┆ 2     │
│ KDP-QM, PUK                     ┆ 2     │
└─────────────────────────────────┴───────┘


In [101]:
import plotly.graph_objs as go

# create the bar chart
fig = go.Figure()
fig.add_trace(go.Bar(
    x=top_10_participants['count'],
    y=top_10_participants['name'],
    orientation='h'
))

# customize the chart layout
fig.update_layout(
    title="Top 10 Participants in the Dataset",
    xaxis_title="Count",
    yaxis_title="Participant",
    yaxis_categoryorder="total ascending",
    margin=dict(l=150),
    height=500
)

# display the chart
fig.show()

In [102]:
import plotly.express as px


names = top_10_participants.select('name').to_series().to_list()
vals = top_10_participants.select('count').to_series().to_list()

px.pie(
       names = names,
       values = vals,
       color_discrete_sequence = px.colors.sequential.Plasma_r,
       #title="Top 10 participant for global conflicts"
)

In [103]:
india_df = df.filter(pl.col("Buyer") == "India")

pivoted_india_df = india_df.drop(["Seller", "Buyer","Deal ID", "Designation","Description", "Order date is estimate", "Numbers delivered is estimate", "Delivery year", "Delivery year is estimate", "Status", "SIPRI estimate", "TIV deal unit", "TIV delivery values", "Local production", "No. Comments"])
print(pivoted_india_df)
# # pivot the table on "Order date" and "Armament category", and aggregate the values by summing them up
# pivoted_india_df = pivoted_india_df.pivot(on="Armament category",
#                                           index="Order date",
#                                           values="Numbers delivered").fill_null(0)


# print(pivoted_india_df)

shape: (2_686, 3)
┌───────────────────┬────────────┬───────────────────┐
│ Armament category ┆ Order date ┆ Numbers delivered │
│ ---               ┆ ---        ┆ ---               │
│ str               ┆ i64        ┆ i64               │
╞═══════════════════╪════════════╪═══════════════════╡
│ Armoured vehicles ┆ 1948       ┆ 120               │
│ Aircraft          ┆ 1949       ┆ 4                 │
│ Ships             ┆ 1948       ┆ 3                 │
│ Aircraft          ┆ 1950       ┆ 5                 │
│ Aircraft          ┆ 1950       ┆ 3                 │
│ …                 ┆ …          ┆ …                 │
│ Missiles          ┆ 2017       ┆ 100               │
│ Missiles          ┆ 2012       ┆ 5                 │
│ Armoured vehicles ┆ 2020       ┆ 125               │
│ Engines           ┆ 2016       ┆ 2                 │
│ Engines           ┆ 2016       ┆ 2                 │
└───────────────────┴────────────┴───────────────────┘


In [104]:
india_df_conflict = df_conflict.filter((pl.col("side_a").str.contains("India")) |
                                       (pl.col("side_b").str.contains("India")) |
                                       (pl.col("side_a_2nd").str.contains("India")) |
                                       (pl.col("side_b_2nd").str.contains("India")) |
                                       (pl.col("location").str.contains("India"))
                                       )
# india_df_conflict = india_df_conflict
india_df_conflict = india_df_conflict.group_by("year").agg([pl.col("intensity_level").count()]).sort("year")
india_df_conflict = india_df_conflict.with_columns([pl.col("intensity_level").apply(lambda x : 1 if x <= 1 else 2)])
# india_df_conflict = india_df_conflict.with_columns([pl.col("intensity_level").apply(lambda x : max(x))])
print(india_df_conflict)

AttributeError: 'Expr' object has no attribute 'apply'

In [53]:
# join the dataframes on the year column
joined_india_df = pivoted_india_df.join(india_df_conflict,
                                        left_on=["Order date"], right_on=["year"], how="left").fill_null(0)
print(joined_india_df.columns)

NameError: name 'pivoted_india_df' is not defined

In [54]:
# drop the 'Order date' column
india_dataset = joined_india_df.drop('Order date')
# separate the target variable 'intensity_level'
y = india_dataset['intensity_level']
X = india_dataset.drop('intensity_level')
# standardize the data
scaler = StandardScaler()
X_std = scaler.fit_transform(X.to_pandas())
print(X.shape)
print(X_std.shape)


NameError: name 'joined_india_df' is not defined

In [55]:
# create a Random Forest Classifier object
rfc = RandomForestClassifier()

# perform 5-fold cross-validation and get the mean accuracy score
scores = cross_val_score(rfc, X.to_pandas(), y, cv=5)
mean_score = scores.mean()

print('Mean accuracy score:', mean_score)

NameError: name 'X' is not defined

In [None]:

from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
joined_india_pddf = joined_india_df.to_pandas()

joined_india_pddf['Order date'] = pd.to_datetime(joined_india_pddf['Order date'], format='%Y')
joined_india_pddf.set_index('Order date', inplace=True)

# Separate the target variable 'intensity_level'
y = joined_india_pddf['intensity_level']
X = joined_india_pddf.drop('intensity_level', axis=1)

# Split the data into training and testing sets
train_data_X = X.loc[X.index <= '2010']
train_data_y = y.loc[y.index <= '2010']
test_data_X = X.loc[X.index >= '2011']
test_data_y = y.loc[y.index >= '2011']

# Fit an ARIMA model to the training data
arima_model = ARIMA(train_data_y, order=(1, 1, 1), exog=train_data_X)
arima_fit = arima_model.fit()

# Forecast the values of the time series for the test set
forecast = arima_fit.forecast(steps=len(test_data_X), exog=test_data_X)

# Evaluate the model using RMSE
mse = mean_squared_error(test_data_y, forecast)
rmse = np.sqrt(mse)
print('RMSE:', rmse)