# Conflicts factorization

In this notebook, we try to uncover hidden information in our dataset: we want to find meaningful patterns among the world's conflicts through the years.

This notebook uses matrix factorization on a country × year matrix of conflict counts to expose the latent structure of the data.

In [1]:
import requests
import json
import pandas as pd
import scipy as sc
import numpy as np
import matplotlib.pyplot as plt
import sys
from sklearn.decomposition import NMF
from sklearn.metrics import mean_squared_error
from sklearn.manifold import TSNE

## Data Handling

> TO DO LIST

- Drop all non-necessary columns. 
- Create a vector containing the countries of interest
- Separate dates

In [8]:
# Load the raw inputs: the colonies table and the cleaned conflict list.
colonies_df = pd.read_csv('datasets/colonies_wikipedia.csv')
conflicts_df = pd.read_csv('datasets/clean_conflict.csv')

# Separate Datetime: split `ependdate` into calendar parts. Only `year` and
# `month` survive the column drop below; `day` and `time` are intermediates.
conflicts_df['time'] = pd.to_datetime(conflicts_df['ependdate'])
conflicts_df['year'] = conflicts_df['time'].dt.year
conflicts_df['month'] = conflicts_df['time'].dt.month
conflicts_df['day'] = conflicts_df['time'].dt.day

# Drop non-necessary columns. The first column is removed by position
# (presumably a saved index column -- TODO confirm against the CSV).
conflicts_df = conflicts_df.drop(
    [conflicts_df.columns[0], 'conflictid', 'sidebid', 'incomp', 'cumint',
     'type', 'ISO2', 'ependdate', 'day', 'time'],
    axis=1,
)
colonies_df = colonies_df.drop(['ID', 'Day', 'URL'], axis=1)

# Remove rows with a missing value in ANY remaining column. `dropna` returns a
# new frame, so the result must be assigned back; the original bare calls left
# both frames untouched. `axis` is passed by keyword because recent pandas
# versions no longer accept it positionally.
colonies_df = colonies_df.dropna(axis=0)
conflicts_df = conflicts_df.dropna(axis=0)

# Preview the cleaned conflicts rather than dumping the whole frame.
conflicts_df.head(10)

Unnamed: 0,location,sidea,side b,terr,year,intensity,start_month,region,month
0,Bolivia,Bolivia,Popular Revolutionary Movement,Bolivia,1946.0,2,7,5,7.0
1,Bolivia,Bolivia,MNR,Bolivia,1952.0,1,7,5,4.0
2,Bolivia,Bolivia,ELN,Bolivia,1967.0,1,7,5,10.0
10,Cambodia,France,Khmer Issarak,Cambodia,1953.0,1,8,3,11.0
14,China,China,PLA,China,1949.0,2,12,3,12.0
18,Greece,Greece,DSE,Greece,1949.0,2,3,1,10.0
22,Indonesia,Netherlands,Indonesian People's Army,Indonesia,1949.0,1,10,3,8.0
23,Iran,Iran,KDPI,Kurdistan,1946.0,1,5,2,12.0
26,Iran,Iran,KDPI,Kurdistan,1968.0,1,5,2,12.0
36,Iran,Iran,KDPI,Kurdistan,1988.0,1,5,2,12.0


In [3]:
# Normalise country names in the colonies table. `Series.replace` returns a
# new Series, so the result is assigned back to the column; the original bare
# calls had no effect.
# The replacements are applied in sequence and the order matters:
# 'Congo' is renamed first, so the 'Congo' produced from 'DR Congo' afterwards
# is not renamed again.
colonies_df['Colonized Country'] = (
    colonies_df['Colonized Country']
    .replace('Republic of Tunisia', 'Tunisia')
    .replace('Congo', 'Republic of the Congo')
    .replace('DR Congo', 'Congo')
    .replace('Independent State of Papua New Guinea', 'New Guinea')
)
colonies_df.head()

Unnamed: 0.1,Unnamed: 0,Colonized Country,Month,Year,Colonizer Country
0,1,Iceland,6,1944,Denmark
1,2,Canada,7,1867,United Kingdom
2,3,United States,7,1776,France
3,4,Haiti,1,1804,France
4,5,Benin,8,1960,France


In [4]:
# Country vector for the analysis: every colonized country from the colonies
# table, followed by a fixed list of colonizer countries.
colonizers = [
    'France', 'United Kingdom', 'Denmark', 'Italy',
    'Belgium', 'Spain', 'Portugal', 'Russia',
]
colonized_names = colonies_df['Colonized Country'].values
colonisation_countries = np.concatenate((colonized_names, colonizers))

## Matrix Creation

In [31]:
min_year = 1946
max_year = 2017


def _count_conflicts_per_year(conflicts, countries, first_year, last_year):
    """Build a (country x year) matrix of conflict-record counts.

    Parameters
    ----------
    conflicts : DataFrame with a text column 'location' and a numeric column
        'year'.
    countries : sequence of country names; one output row per entry.
    first_year : year represented by column 0.
    last_year : exclusive upper bound; the last column is ``last_year - 1``.

    Returns
    -------
    numpy.ndarray of floats with shape ``(len(countries), last_year - first_year)``.
    Entry ``[i, j]`` is the number of rows whose 'location' contains
    ``countries[i]`` and whose 'year' equals ``first_year + j``.
    """
    n_years = last_year - first_year
    counts = np.zeros((len(countries), n_years))
    locations = conflicts['location']

    for row, country in enumerate(countries):
        if not isinstance(country, str):
            # A missing (NaN) country name cannot be matched; keep its row at zero.
            continue
        # Substring match as before, but literal (regex=False) so that names
        # containing regex metacharacters such as parentheses are matched
        # verbatim, and NaN-safe (na=False) so the mask is always boolean.
        involved = locations.str.contains(country, regex=False, na=False)
        # The country filter does not depend on the year, so it is computed
        # once per country and all years are tallied in a single pass.
        years = conflicts.loc[involved, 'year'].dropna().astype(int).values
        years = years[(years >= first_year) & (years < last_year)]
        counts[row] = np.bincount(years - first_year, minlength=n_years)

    return counts


# NOTE(review): the matrix has max_year - min_year columns, so the covered
# years are min_year .. max_year - 1 (max_year itself is excluded). Confirm
# that this is the intended span.
matrix = _count_conflicts_per_year(
    conflicts_df, colonisation_countries, min_year, max_year
)

np.set_printoptions(threshold=100000)
matrix

  # Remove the CWD from sys.path while we load stuff.


array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  1.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 1.,  0.,  2., ...,  0.,  1.,  1.]])

## Matrix Factorization

In [40]:
# Sanity check: first label of the country vector, i.e. the country that row 0 of `matrix` refers to.
colonisation_countries[0]

'Iceland'

## Results & Observations