In [10]:
import polars as pl
import pandas as pd

def read_csv_data(filename: str) -> pl.DataFrame:
    """Load a semicolon-separated text export into a polars DataFrame.

    The first five lines of the file are skipped, so the header is taken
    from the next line. The content is decoded as ISO-8859-1 and
    ``ignore_errors=True`` is passed so that parse errors do not abort the
    read.

    Args:
        filename: Path of the text file to read.

    Returns:
        The parsed file as a ``pl.DataFrame``.
    """
    # No separate manual pass over the header line is made: its result was
    # never used, and opening the file with the platform default encoding can
    # fail on ISO-8859-1 bytes before polars even gets to parse the data.
    # NOTE(review): ``skip_rows_after_header=2`` skips the two lines directly
    # after the header, not the last two lines of the file -- confirm this is
    # what is intended for this export.
    return pl.read_csv(
        filename,
        skip_rows=5,
        separator=';',
        skip_rows_after_header=2,
        encoding="ISO-8859-1",
        ignore_errors=True,
    )

def read_csv_data_(filename: str) -> pd.DataFrame:
    """Load a semicolon-separated text export into a pandas DataFrame.

    The first five lines of the file are skipped, so the header is taken
    from the next line, and the last two lines are dropped via
    ``skipfooter``. The content is decoded as ISO-8859-1.

    Args:
        filename: Path of the text file to read.

    Returns:
        The parsed file as a ``pd.DataFrame``.
    """
    # No separate manual pass over the header line is made: its result was
    # never used, and opening the file with the platform default encoding can
    # fail on ISO-8859-1 bytes before pandas even gets to parse the data.
    # ``skipfooter`` is only supported by the python parser engine, hence
    # ``engine='python'``.
    return pd.read_csv(
        filename,
        skiprows=5,
        delimiter=';',
        skipfooter=2,
        engine='python',
        encoding="ISO-8859-1",
    )

In [11]:
# Show the column names produced by the polars-based reader.
print(
    read_csv_data("DealsAndTIVs-2023-03-11-16_22_41 (1).txt").columns
)

['Deal ID', 'Seller', 'Buyer', 'Designation', 'Description', 'Armament category', 'Order date', 'Order date is estimate', 'Numbers delivered', 'Numbers delivered is estimate', 'Delivery year', 'Delivery year is estimate', 'Status', 'SIPRI estimate', 'TIV deal unit', 'TIV delivery values', 'Local production']


In [12]:
# Parse the export once with each reader and keep only the column names,
# instead of re-reading the same file for every statement.
cols_pd = read_csv_data_("DealsAndTIVs-2023-03-11-16_22_41 (1).txt").columns
cols_pl = read_csv_data("DealsAndTIVs-2023-03-11-16_22_41 (1).txt").columns

print(cols_pd)

# Element-by-element comparison of the polars names against the pandas Index.
print(cols_pl == cols_pd)

Index(['Deal ID', 'Seller', 'Buyer', 'Designation', 'Description',
       'Armament category', 'Order date', 'Order date is estimate',
       'Numbers delivered', 'Numbers delivered is estimate', 'Delivery year',
       'Delivery year is estimate', 'Status', 'SIPRI estimate',
       'TIV deal unit', 'TIV delivery values', 'Local production'],
      dtype='object')
[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True]


In [13]:
import re


def read_csv_data(filename: str) -> pl.DataFrame:
    """Read a semicolon-separated text export into a polars DataFrame.

    Args:
        filename: Path of the text file to read.

    Returns:
        The parsed file as a ``pl.DataFrame``.
    """
    read_options = dict(
        skip_rows=5,                 # lines before the header line
        separator=';',
        skip_rows_after_header=2,    # lines skipped right after the header
        encoding="ISO-8859-1",
        ignore_errors=True,          # errors cause no actual issues
    )
    return pl.read_csv(filename, **read_options)


def replace_unicode_chars(df: pl.DataFrame, col_name: str) -> pl.Series:
    r"""Decode ``\uN?`` escapes in one text column.

    Every occurrence of a literal backslash, ``u``, a run of decimal digits
    ``N`` and a ``?`` is replaced by the character ``chr(N)``. All escapes in
    a value are decoded, not only the first one, and null values are left
    untouched.

    Args:
        df: Frame that holds the column.
        col_name: Name of the column to process.

    Returns:
        The column object obtained from ``df[col_name]``, with decoded values
        written back by per-index assignment.

    Raises:
        ValueError: If a decimal code is outside the range ``chr`` accepts.
    """
    col = df[col_name]
    # Literal "\u", decimal digits (captured as group 1), literal "?".
    regex = re.compile(r"\\u(\d+)\?")

    def decode(match):
        # The captured code is decimal, so no base argument is needed.
        return chr(int(match.group(1)))

    for i, val in enumerate(col):
        if val is None:
            # Null entries have nothing to decode.
            continue
        decoded = regex.sub(decode, val)
        if decoded != val:
            # Only touch rows that actually contained an escape.
            col[i] = decoded

    return col


def strip_strings(x: str) -> str:
    """Return ``x`` with surrounding whitespace removed if it is a string.

    Any value that is not a ``str`` is handed back unchanged.
    """
    return x.strip() if isinstance(x, str) else x


def rtf_data_processing(df: pl.DataFrame) -> pl.DataFrame:
    """Add an estimate flag for the order year and make the year an integer.

    A new column ``"is estimated year order"`` is ``"Yes"`` where the
    ``'Year(s) Weapon of Order'`` value contains a parenthesised part and
    ``"No"`` otherwise. The year column itself then has all parentheses
    removed and is cast to ``pl.Int64``.

    Args:
        df: Frame containing a ``'Year(s) Weapon of Order'`` text column.

    Returns:
        A frame with the flag column added and the year column converted.
    """
    year_col = 'Year(s) Weapon of Order'

    # The flag has to be computed first, while the parentheses are still there.
    estimate_flag = (
        pl.when(pl.col(year_col).str.contains(r'\(.*\)'))
        .then(pl.lit("Yes"))
        .otherwise(pl.lit("No"))
        .alias("is estimated year order")
    )
    flagged = df.with_columns([estimate_flag])

    # Drop "(" and ")" everywhere in the value, then convert to an integer.
    year_as_int = (
        pl.col(year_col)
        .str.replace_all(r"[()]", "")
        .alias(year_col)
        .cast(pl.Int64)
    )
    return flagged.with_columns([year_as_int])


def joined_table(df_rtf, csv_df):
    """Left-join the CSV deals onto the processed RTF table.

    The RTF frame is passed through ``rtf_data_processing`` and its
    ``"No. Designation"`` and ``"No. Comments"`` columns are replaced by the
    result of ``replace_unicode_chars``. The CSV ``'Designation'`` column is
    stripped before joining. Rows are matched on seller/supplier,
    buyer/recipient, designation and order year.

    Args:
        df_rtf: Frame holding the RTF-derived data.
        csv_df: Frame holding the CSV deals.

    Returns:
        The CSV columns plus ``'No. Comments'`` from the RTF side.
    """
    rtf_clean = rtf_data_processing(df_rtf)

    # remove possible spaces for the designations
    csv_df = csv_df.with_columns(csv_df['Designation'].str.strip())

    # Decode the escaped characters in the two free-text RTF columns.
    for text_col in ("No. Designation", "No. Comments"):
        rtf_clean = rtf_clean.with_columns(replace_unicode_chars(df_rtf, text_col))

    # Key columns, pairwise aligned between the two frames.
    csv_keys = ['Seller', 'Buyer', 'Designation', 'Order date']
    rtf_keys = ["Supplier", "Recipient", "No. Designation", 'Year(s) Weapon of Order']
    merged = csv_df.join(rtf_clean, left_on=csv_keys, right_on=rtf_keys, how="left")

    output_columns = [
        'Deal ID', 'Seller', 'Buyer', 'Designation', 'Description',
        'Armament category', 'Order date', 'Order date is estimate',
        'Numbers delivered', 'Numbers delivered is estimate', 'Delivery year',
        'Delivery year is estimate', 'Status', 'SIPRI estimate',
        'TIV deal unit', 'TIV delivery values', 'Local production',
        'No. Comments',
    ]
    return merged.select(output_columns)

In [14]:
# Paths of the two exports loaded in this cell.
file_name, file_name_ = "DealsAndTIVs-2023-03-11-16_22_41 (1).txt", "data.txt"

# Parse both files with the same reader; the second frame is the cell output.
df, df_ = read_csv_data(file_name), read_csv_data(file_name_)
df_


Deal ID,Seller,Buyer,Designation,Description,Armament category,Order date,Order date is estimate,Numbers delivered,Numbers delivered is estimate,Delivery year,Delivery year is estimate,Status,SIPRI estimate,TIV deal unit,TIV delivery values,Local production
i64,str,str,str,str,str,i64,str,i64,str,i64,str,str,f64,f64,f64,str
5787,"""United Kingdom""","""Jordan""","""Autocrat""","""light aircraft""","""Aircraft""",1950,"""No""",2,"""No""",1950,"""No""","""New""",0.12,0.12,0.24,"""No"""
5989,"""Canada""","""Lebanon""","""DHC-1 Chipmunk""","""trainer aircraft""","""Aircraft""",1949,"""Yes""",3,"""No""",1950,"""No""","""New""",0.12,0.12,0.36,"""No"""
5266,"""United Kingdom""","""India""","""Pig""","""APC/APV""","""Armoured vehicles""",1948,"""Yes""",120,"""Yes""",1950,"""No""","""Second hand""",0.12,0.048,5.76,"""No"""
5330,"""Netherlands""","""Indonesia""","""PBY-5A Catalina""","""maritime patrol aircraft""","""Aircraft""",1950,"""Yes""",8,"""Yes""",1950,"""No""","""Second hand""",3.1,1.24,9.92,"""No"""
5862,"""Soviet Union""","""North Korea""","""BTR-152""","""APC""","""Armoured vehicles""",1949,"""Yes""",15,"""Yes""",1950,"""Yes""","""New""",0.13,0.13,1.95,"""No"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
65380,"""United States""","""Russia""","""vehicle engine""","""vehicle engine""","""Engines""",2018,"""Yes""",25,"""Yes""",2022,"""No""","""New""",0.05,0.05,1.25,"""Yes"""
66303,"""Turkiye""","""Burkina Faso""","""Ejder""","""APC""","""Armoured vehicles""",2022,"""Yes""",4,"""Yes""",2022,"""No""","""New""",0.3,0.3,1.2,"""No"""
66305,"""China""","""Benin""","""CSK-131""","""APC/APV""","""Armoured vehicles""",2022,"""Yes""",6,"""Yes""",2022,"""Yes""","""New""",0.14,0.14,0.84,"""No"""
24935,"""China""","""Nigeria""","""PC 46m""","""patrol craft""","""Ships""",2022,"""Yes""",1,"""No""",2022,"""No""","""New""",7.5,7.5,7.5,"""No"""


In [15]:
# Display the frame that was read from `file_name` in the previous cell.
# NOTE(review): the rendered output ends with an all-null row -- presumably a
# trailing non-data line of the file that the reader keeps; confirm and drop it
# if it is not wanted.
df

Deal ID,Seller,Buyer,Designation,Description,Armament category,Order date,Order date is estimate,Numbers delivered,Numbers delivered is estimate,Delivery year,Delivery year is estimate,Status,SIPRI estimate,TIV deal unit,TIV delivery values,Local production
i64,str,str,str,str,str,i64,str,i64,str,i64,str,str,f64,f64,f64,str
36470,"""Norway""","""Poland""","""NSM""","""anti-ship missile""","""Missiles""",2008,"""No""",6,"""Yes""",2011,"""Yes""","""New""",1.0,1.0,6.0,"""No"""
53062,"""Russia""","""Iran""","""BMP-2 turret""","""IFV turret""","""Other""",1999,"""Yes""",10,"""Yes""",2011,"""Yes""","""New""",0.55,0.55,5.5,"""No"""
35553,"""China""","""Iran""","""C-704""","""anti-ship missile""","""Missiles""",2003,"""Yes""",25,"""Yes""",2011,"""Yes""","""New""",0.6,0.6,15.0,"""Yes"""
33703,"""United States""","""Pakistan""","""Perry""","""frigate""","""Ships""",2010,"""No""",1,"""No""",2011,"""No""","""Second hand but modernized""",300.0,198.0,198.0,"""No"""
31710,"""Spain""","""Venezuela""","""POVZEE""","""OPV""","""Ships""",2005,"""No""",3,"""No""",2011,"""No""","""New""",54.75,54.75,164.25,"""No"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
65208,"""China""","""Egypt""","""TL-2""","""anti-ship missile/ASM""","""Missiles""",2016,"""Yes""",50,"""Yes""",2016,"""Yes""","""New""",0.04,0.04,2.0,"""No"""
65208,"""China""","""Egypt""","""TL-2""","""anti-ship missile/ASM""","""Missiles""",2016,"""Yes""",75,"""Yes""",2017,"""Yes""","""New""",0.04,0.04,3.0,"""No"""
65208,"""China""","""Egypt""","""TL-2""","""anti-ship missile/ASM""","""Missiles""",2016,"""Yes""",75,"""Yes""",2018,"""Yes""","""New""",0.04,0.04,3.0,"""No"""
,,,,,,,,,,,,,,,,
