In [18]:
from sqlalchemy import Column, Integer, String, ForeignKey, create_engine, DateTime, func, Text, select
from sqlalchemy import Float, Boolean
from sqlalchemy.orm import declarative_base, Session, relationship, sessionmaker, configure_mappers
from datetime import datetime, timezone
from typing import List, Tuple, Dict
import pandas as pd
from IPython.display import display, HTML

In [2]:
import chardet

# Guess the CSV's text encoding from a prefix of the file rather than the whole
# file; the result is a heuristic and carries its own confidence score.
with open('content/globalterrorismdb_0718dist.csv', mode='rb') as csv_file:
    head_bytes = csv_file.read(100000)  # only the first 100000 bytes are sampled

result = chardet.detect(head_bytes)
print(result)

{'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}


In [3]:
# Load the complete dataset using the encoding reported by the detection cell.
df_original = pd.read_csv(
    'content/globalterrorismdb_0718dist.csv',
    encoding='ISO-8859-1',
    low_memory=False,  # parse the file in one pass instead of in chunks
)

In [4]:
# Print a structural summary of the frame: index range, column count, dtype breakdown, memory usage.
df_original.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181691 entries, 0 to 181690
Columns: 135 entries, eventid to related
dtypes: float64(55), int64(22), object(58)
memory usage: 187.1+ MB


In [5]:
# Render the frame as the cell output to eyeball its first and last rows.
df_original


Unnamed: 0,eventid,iyear,imonth,iday,approxdate,extended,resolution,country,country_txt,region,...,addnotes,scite1,scite2,scite3,dbsource,INT_LOG,INT_IDEO,INT_MISC,INT_ANY,related
0,197000000001,1970,7,2,,0,,58,Dominican Republic,2,...,,,,,PGIS,0,0,0,0,
1,197000000002,1970,0,0,,0,,130,Mexico,1,...,,,,,PGIS,0,1,1,1,
2,197001000001,1970,1,0,,0,,160,Philippines,5,...,,,,,PGIS,-9,-9,1,1,
3,197001000002,1970,1,0,,0,,78,Greece,8,...,,,,,PGIS,-9,-9,1,1,
4,197001000003,1970,1,0,,0,,101,Japan,4,...,,,,,PGIS,-9,-9,1,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
181686,201712310022,2017,12,31,,0,,182,Somalia,11,...,,"""Somalia: Al-Shabaab Militants Attack Army Che...","""Highlights: Somalia Daily Media Highlights 2 ...","""Highlights: Somalia Daily Media Highlights 1 ...",START Primary Collection,0,0,0,0,
181687,201712310029,2017,12,31,,0,,200,Syria,10,...,,"""Putin's 'victory' in Syria has turned into a ...","""Two Russian soldiers killed at Hmeymim base i...","""Two Russian servicemen killed in Syria mortar...",START Primary Collection,-9,-9,1,1,
181688,201712310030,2017,12,31,,0,,160,Philippines,5,...,,"""Maguindanao clashes trap tribe members,"" Phil...",,,START Primary Collection,0,0,0,0,
181689,201712310031,2017,12,31,,0,,92,India,6,...,,"""Trader escapes grenade attack in Imphal,"" Bus...",,,START Primary Collection,-9,-9,0,-9,


In [6]:
# Profile every column: count its distinct values and, for non-numeric
# columns, show a small sample of what those values look like.

df = df_original.copy()  # working copy; df_original stays untouched

for col in df.columns:
    column = df[col]
    unique_count: int = column.nunique()  # distinct values, missing ones excluded

    # A column counts as numeric when every value that is actually present
    # parses as a number. Missing values are dropped first: otherwise any
    # numeric column with gaps would be reported as non-numeric, because NaN
    # fails the not-null test regardless of the column's content.
    # An entirely empty column is reported as non-numeric.
    present = column.dropna()
    is_numeric: bool = (not present.empty) and bool(
        pd.to_numeric(present, errors='coerce').notna().all()
    )

    if is_numeric:
        print(f"Столбец '{col}': уникальных значений {unique_count} — цифровые значения")
    else:
        # Only the first five distinct values are shown, so only those are converted to a list.
        unique_values: List = column.unique()[:5].tolist()
        print(f"Столбец '{col}': уникальных значений {unique_count} — {unique_values}")


Столбец 'eventid': уникальных значений 181691 — цифровые значения
Столбец 'iyear': уникальных значений 47 — цифровые значения
Столбец 'imonth': уникальных значений 13 — цифровые значения
Столбец 'iday': уникальных значений 32 — цифровые значения
Столбец 'approxdate': уникальных значений 2244 — [nan, 'January 19-20, 1970', 'February 6-9, 1970', 'February 11-14, 1970', 'February 16-17, 1970']
Столбец 'extended': уникальных значений 2 — цифровые значения
Столбец 'resolution': уникальных значений 1859 — [nan, '3/8/1970', '3/15/1970', '3/31/1970', '3/26/1970']
Столбец 'country': уникальных значений 205 — цифровые значения
Столбец 'country_txt': уникальных значений 205 — ['Dominican Republic', 'Mexico', 'Philippines', 'Greece', 'Japan']
Столбец 'region': уникальных значений 12 — цифровые значения
Столбец 'region_txt': уникальных значений 12 — ['Central America & Caribbean', 'North America', 'Southeast Asia', 'Western Europe', 'East Asia']
Столбец 'provstate': уникальных значений 2855 — [nan,

In [27]:
# Declarative base: every model class in this cell subclasses it, and their tables are collected in Base.metadata.
Base = declarative_base()

class Incident(Base):
    """One attack record: the hub table that every other table hangs off.

    Lookup tables are referenced through the ``*_id`` foreign keys declared
    here (many incidents -> one lookup row). The detail tables (victims,
    properties, hostages, international_data, other_data) point back to this
    table through their own ``incident_id`` column.
    """

    __tablename__ = 'incident'

    # ---- columns ----
    id = Column(Integer, primary_key=True)  # eventid
    iyear = Column(Integer, nullable=False)
    imonth = Column(Integer, nullable=False)
    iday = Column(Integer, nullable=False)
    city_id = Column(Integer, ForeignKey('city.id'))
    place_spec_id = Column(Integer, ForeignKey('place_spec.id'))
    attack_spec_id = Column(Integer, ForeignKey('attack_spec.id'))
    type_incident_id = Column(Integer, ForeignKey('type_incident.id'))
    type_attack_id = Column(Integer, ForeignKey('type_attack.id'))
    target_sub_type_id = Column(Integer, ForeignKey('target_sub_type.id'))
    corp_id = Column(Integer, ForeignKey('corp.id'))
    target_id = Column(Integer, ForeignKey('target.id'))
    natlty_id = Column(Integer, ForeignKey('natlty.id'))
    claimmode_id = Column(Integer, ForeignKey('claimmode.id'))
    weapon_sub_type_id = Column(Integer, ForeignKey('weapon_sub_type.id'))
    propextent_id = Column(Integer, ForeignKey('propextent.id'))

    # ---- lookup rows this incident refers to (foreign key lives on incident) ----
    city = relationship("City", back_populates="incident")
    place_spec = relationship("PlaceSpec", back_populates="incident")
    attack_spec = relationship("AttackSpec", back_populates="incident")
    type_incident = relationship("Type_Incident", back_populates="incident")
    type_attack = relationship("Type_Attack", back_populates="incident")
    target_sub_type = relationship("Target_Sub_Type", back_populates="incident")
    corp = relationship("Corp", back_populates="incident")
    target = relationship("Target", back_populates="incident")
    natlty = relationship("Natlty", back_populates="incident")
    claimmode = relationship("Claimmode", back_populates="incident")
    weapon_sub_type = relationship("Weapon_Sub_Type", back_populates="incident")
    propextent = relationship("Propextent", back_populates="incident")

    # ---- detail rows that refer to this incident (foreign key lives on the other table) ----
    victims = relationship("Victims", back_populates="incident")
    properties = relationship("Properties", back_populates="incident")
    hostages = relationship("Hostages", back_populates="incident")
    international_data = relationship("International_Data", back_populates="incident")
    other_data = relationship("Other_Data", back_populates="incident")


class Country(Base):
    """Country lookup table; rows are referenced by ``region.country_id``."""

    __tablename__ = 'country'

    id = Column(Integer, primary_key=True)
    country_txt = Column(String, nullable=False)

    # Regions whose country_id points at this country.
    region = relationship("Region", back_populates="country")

class Region(Base):
    """Region lookup table, linked to one country and referenced by cities.

    NOTE(review): a single ``country_id`` per region means a region can point
    at only one country - confirm this is the intended direction of the link.
    """

    __tablename__ = 'region'

    id = Column(Integer, primary_key=True)
    region_txt = Column(String, nullable=False)
    country_id = Column(Integer, ForeignKey('country.id'))

    # Cities whose region_id points at this region.
    city = relationship("City", back_populates="region")
    # The country row referenced by country_id.
    country = relationship("Country", back_populates="region")

class City(Base):
    """City lookup table; each city belongs to a region and is referenced by incidents."""

    __tablename__ = 'city'

    id = Column(Integer, primary_key=True)
    city = Column(String, nullable=False)
    region_id = Column(Integer, ForeignKey('region.id'))

    # Incidents whose city_id points at this city.
    incident = relationship("Incident", back_populates="city")
    # The region row referenced by region_id.
    region = relationship("Region", back_populates="city")

class PlaceSpec(Base):
    """Location details of an attack: coordinates plus free-text description fields."""

    __tablename__ = 'place_spec'

    id = Column(Integer, primary_key=True)
    latitude = Column(Float)
    longitude = Column(Float)
    specificity = Column(Integer)
    vicinity = Column(Boolean, nullable=True)  # NaN coming from the CSV is allowed
    location = Column(String)
    summary = Column(Text)
    motive = Column(Text)

    # Incidents whose place_spec_id points at this row.
    incident = relationship("Incident", back_populates="place_spec")

class AttackSpec(Base):
    """Boolean attack flags; every flag is nullable so a missing value can be stored as NULL."""

    __tablename__ = 'attack_spec'

    id = Column(Integer, primary_key=True)
    crit1 = Column(Boolean, nullable=True)
    crit2 = Column(Boolean, nullable=True)
    crit3 = Column(Boolean, nullable=True)
    doubtterr = Column(Boolean, nullable=True)
    multiple = Column(Boolean, nullable=True)
    success = Column(Boolean, nullable=True)
    suicide = Column(Boolean, nullable=True)

    # Incidents whose attack_spec_id points at this row.
    incident = relationship("Incident", back_populates="attack_spec")

class Victims(Base):
    """Casualty counters attached to an incident through ``incident_id``."""

    __tablename__ = 'victims'

    id = Column(Integer, primary_key=True)
    nkill = Column(Integer)
    nkillus = Column(Integer)
    nkillter = Column(Integer)
    nwound = Column(Integer)
    nwoundus = Column(Integer)
    nwoundte = Column(Integer)
    incident_id = Column(Integer, ForeignKey('incident.id'))

    # The incident row referenced by incident_id.
    incident = relationship("Incident", back_populates="victims")

class Properties(Base):
    """Property-damage details attached to an incident through ``incident_id``.

    NOTE(review): all three data columns are NOT NULL - confirm the source
    never has missing values for them before loading.
    """

    __tablename__ = 'properties'

    id = Column(Integer, primary_key=True)
    property_ = Column(Boolean, nullable=False)
    propvalue = Column(Integer, nullable=False)
    propcomment = Column(Text, nullable=False)

    incident_id = Column(Integer, ForeignKey('incident.id'))
    # The incident row referenced by incident_id.
    incident = relationship("Incident", back_populates="properties")

class Hostages(Base):
    """Hostage / kidnapping details attached to an incident through ``incident_id``.

    NOTE(review): ``ishostkid`` and ``hostkidoutcome_txt`` are NOT NULL -
    confirm the source always provides them before loading.
    """

    __tablename__ = 'hostages'

    id = Column(Integer, primary_key=True)
    ishostkid = Column(Boolean, nullable=False)
    nhostkid = Column(Integer)
    nhostkidus = Column(Integer)
    ndays = Column(Integer)
    ransom = Column(Boolean)
    ransomamt = Column(Integer)
    ransompaid = Column(Integer)
    hostkidoutcome_txt = Column(String, nullable=False)

    incident_id = Column(Integer, ForeignKey('incident.id'))

    # The incident row referenced by incident_id.
    incident = relationship("Incident", back_populates="hostages")

class International_Data(Base):
    """International-dimension flags attached to an incident through ``incident_id``.

    The flag columns are nullable on purpose: the source data contains the
    value -9 in these columns next to 0 and 1, and SQLAlchemy's ``Boolean``
    only accepts True/False (or 1/0) and None. A loader therefore has to map
    any value other than 0/1 to None, which a NOT NULL column could not store.
    """
    __tablename__ = 'international_data'
    id = Column(Integer, primary_key=True)
    INT_LOG = Column(Boolean, nullable=True)   # NULL = not 0/1 in the source (e.g. -9)
    INT_IDEO = Column(Boolean, nullable=True)  # NULL = not 0/1 in the source (e.g. -9)
    INT_MISC = Column(Boolean, nullable=True)  # NULL = not 0/1 in the source
    INT_ANY = Column(Boolean, nullable=True)   # NULL = not 0/1 in the source (e.g. -9)

    incident_id = Column(Integer, ForeignKey('incident.id'))
    # The incident row referenced by incident_id.
    incident = relationship(lambda:Incident, back_populates="international_data")

class Type_Incident(Base):
    """Lookup table of incident types, referenced by ``incident.type_incident_id``."""

    __tablename__ = 'type_incident'

    id = Column(Integer, primary_key=True)   # filled from the `alternative` column
    alternative_txt = Column(String, nullable=False)

    # Incidents whose type_incident_id points at this row.
    incident = relationship("Incident", back_populates="type_incident")
    
class Type_Attack(Base):
    """Lookup table of attack types, referenced by ``incident.type_attack_id``."""

    __tablename__ = 'type_attack'

    id = Column(Integer, primary_key=True)   # filled from the `attacktype1` column
    attacktype1_txt = Column(String, nullable=False)

    # Incidents whose type_attack_id points at this row.
    incident = relationship("Incident", back_populates="type_attack")

class Target_Type(Base):
    """Lookup table of target types; parent of ``target_sub_type`` rows."""

    __tablename__ = 'target_type'

    id = Column(Integer, primary_key=True)   # filled from the `targtype1` column
    targtype1_txt = Column(String, nullable=False)

    # Sub-types whose target_type_id points at this row.
    target_sub_type = relationship("Target_Sub_Type", back_populates="target_type")


class Target_Sub_Type(Base):
    """Lookup table of target sub-types; each belongs to a target type and is referenced by incidents."""

    __tablename__ = 'target_sub_type'

    id = Column(Integer, primary_key=True)   # filled from the `targsubtype1` column
    targsubtype1_txt = Column(String, nullable=False)

    target_type_id = Column(Integer, ForeignKey('target_type.id'))
    # The target type row referenced by target_type_id.
    target_type = relationship("Target_Type", back_populates="target_sub_type")
    # Incidents whose target_sub_type_id points at this row.
    incident = relationship("Incident", back_populates="target_sub_type")

class Corp(Base):
    """Lookup table of ``corp1`` values, referenced by ``incident.corp_id``."""

    __tablename__ = 'corp'

    id = Column(Integer, primary_key=True)   # own ids, not taken from a dataset column
    corp1 = Column(String, nullable=False)

    # Incidents whose corp_id points at this row.
    incident = relationship("Incident", back_populates="corp")

class Target(Base):
    """Lookup table of ``target1`` values, referenced by ``incident.target_id``."""

    __tablename__ = 'target'

    id = Column(Integer, primary_key=True)   # own ids, not taken from a dataset column
    target1 = Column(String, nullable=False)

    # Incidents whose target_id points at this row.
    incident = relationship("Incident", back_populates="target")

class Natlty(Base):
    """Lookup table of ``natlty1`` values, referenced by ``incident.natlty_id``."""

    __tablename__ = 'natlty'

    id = Column(Integer, primary_key=True)   # filled from the `natlty1` column
    natlty1_txt = Column(String, nullable=False)

    # Incidents whose natlty_id points at this row.
    incident = relationship("Incident", back_populates="natlty")

class Claimmode(Base):
    """Lookup table of ``claimmode`` values, referenced by ``incident.claimmode_id``."""

    __tablename__ = 'claimmode'

    id = Column(Integer, primary_key=True)   # filled from the `claimmode` column
    claimmode_txt = Column(String, nullable=False)

    # Incidents whose claimmode_id points at this row.
    incident = relationship("Incident", back_populates="claimmode")

class Weapon_Type(Base):
    """Lookup table of weapon types; parent of ``weapon_sub_type`` rows."""

    __tablename__ = 'weapon_type'

    id = Column(Integer, primary_key=True)   # filled from the `weaptype1` column
    weaptype1_txt = Column(String, nullable=False)

    # Sub-types whose weapon_type_id points at this row.
    weapon_sub_type = relationship("Weapon_Sub_Type", back_populates="weapon_type")
    
class Weapon_Sub_Type(Base):
    """Lookup table of weapon sub-types; each belongs to a weapon type and is referenced by incidents."""

    __tablename__ = 'weapon_sub_type'

    id = Column(Integer, primary_key=True)   # filled from the `weapsubtype1` column
    weapsubtype1_txt = Column(String, nullable=False)

    weapon_type_id = Column(Integer, ForeignKey('weapon_type.id'))
    # The weapon type row referenced by weapon_type_id.
    weapon_type = relationship("Weapon_Type", back_populates="weapon_sub_type")
    # Incidents whose weapon_sub_type_id points at this row.
    incident = relationship("Incident", back_populates="weapon_sub_type")

class Propextent(Base):
    """Lookup table of ``propextent_txt`` values, referenced by ``incident.propextent_id``."""

    __tablename__ = 'propextent'

    id = Column(Integer, primary_key=True)   # own ids, not taken from a dataset column
    propextent_txt = Column(String, nullable=False)

    # Incidents whose propextent_id points at this row.
    incident = relationship("Incident", back_populates="propextent")

class Other_Data(Base):
    """Free-text notes and source citations attached to an incident through ``incident_id``.

    All text columns are nullable: the source data leaves ``addnotes`` and the
    ``scite*`` citations empty for many rows, and a NOT NULL constraint would
    make those rows impossible to insert.
    """
    __tablename__ = 'other_data'

    id = Column(Integer, primary_key=True)   # own ids, not taken from a dataset column
    addnotes = Column(Text, nullable=True)   # empty in the source for many rows
    scite1 = Column(Text, nullable=True)     # first source citation, may be missing
    scite2 = Column(Text, nullable=True)     # second source citation, may be missing
    scite3 = Column(Text, nullable=True)     # third source citation, may be missing

    incident_id = Column(Integer, ForeignKey('incident.id'))
    # The incident row referenced by incident_id.
    incident = relationship(lambda:Incident, back_populates="other_data")

# Engine for the SQLite database file; the path is relative to the working directory.
# NOTE(review): SQLite does not enforce FOREIGN KEY constraints unless
# `PRAGMA foreign_keys=ON` is issued per connection - confirm whether enforcement is wanted.
engine = create_engine("sqlite:///content/incident.db")



In [8]:
# Выполняется один раз для создания файла базы. После этого комментируем строку назад
# Base.metadata.create_all(engine)

In [9]:
# Функция для проверки все ли в порядке с релейшеншипами в таблицах (если не выдает ошибки то все пучком)
# configure_mappers()

In [41]:
# Session factory bound to the engine; `Session()` yields a new session.
# NOTE: this rebinds the name `Session` imported from sqlalchemy.orm in the
# imports cell, so from here on `Session` refers to this factory.
Session = sessionmaker(bind=engine)

class ORM:
    """Loaders that copy reference data from the notebook-level ``df`` into the database.

    Each loader opens its own session, inserts in bulk and commits. Rows that
    are already stored are skipped, so running a loader a second time is a
    no-op instead of failing on a duplicate primary key (countries, regions)
    or silently duplicating rows (cities, whose ids are generated).
    """

    @staticmethod
    def fill_countries():
        """Populate ``country`` with one row per distinct ``country`` code in ``df``.

        The dataset's numeric country code is used as the primary key.
        """
        with Session() as session:
            stored_ids = set(session.scalars(select(Country.id)))

            # The code becomes the primary key, so keep exactly one row per code.
            unique_countries = df[['country', 'country_txt']].drop_duplicates(subset=['country'])
            country_data = [
                {"id": row["country"], "country_txt": row["country_txt"]}
                for _, row in unique_countries.iterrows()
                if row["country"] not in stored_ids
            ]

            if country_data:
                session.bulk_insert_mappings(Country, country_data)
                session.commit()

    @staticmethod
    def fill_regions():
        """Populate ``region`` with one row per distinct ``region`` code in ``df``.

        Each region is linked to the country of the first ``df`` row in which
        the region appears. Regions whose country is not present in the
        ``country`` table are skipped, so ``fill_countries`` has to run first.
        """
        with Session() as session:
            known_country_ids = set(session.scalars(select(Country.id)))
            stored_ids = set(session.scalars(select(Region.id)))

            # Deduplicate on the region code only; the country kept is the first one seen.
            unique_regions = df[['region', 'region_txt', 'country']].drop_duplicates(subset=['region'])

            region_data = [
                {
                    "id": row["region"],
                    "region_txt": row["region_txt"],
                    "country_id": row["country"],  # `country` already holds the country id
                }
                for _, row in unique_regions.iterrows()
                if row["country"] in known_country_ids and row["region"] not in stored_ids
            ]

            if region_data:
                session.bulk_insert_mappings(Region, region_data)
                session.commit()

    @staticmethod
    def fill_cities():
        """Populate ``city`` with every distinct (city, region) pair in ``df``.

        Missing and blank city names are ignored, as are cities whose region
        is not present in the ``region`` table (``fill_regions`` has to run
        first). Primary keys are generated by the database.
        """
        with Session() as session:
            known_region_ids = set(session.scalars(select(Region.id)))
            # City ids are generated, so an existing row is recognised by its
            # (name, region) pair rather than by primary key.
            stored_pairs = {
                (name, region_id)
                for name, region_id in session.execute(select(City.city, City.region_id))
            }

            unique_cities = df[['city', 'region']].drop_duplicates()
            unique_cities = unique_cities.dropna(subset=['city'])  # no NaN names
            unique_cities = unique_cities[unique_cities['city'].str.strip() != ""]  # no blank names

            city_data = [
                {"city": row["city"], "region_id": row["region"]}
                for _, row in unique_cities.iterrows()
                if row["region"] in known_region_ids
                and (row["city"], row["region"]) not in stored_pairs
            ]

            if city_data:
                session.bulk_insert_mappings(City, city_data)
                session.commit()

In [16]:
# Выполняем заполнение (т.е заполнение из датафрема- операции одноразовые)
# ORM.fill_countries()

In [None]:
# ORM.fill_regions()

In [None]:
# ORM.fill_cities()

In [12]:
# это вообще заготовка кода))

# with Session() as session:
#     for _, row in df.iterrows():
#         # Получаем ID всех зависимых данных
#         city = session.query(City).filter_by(city=row["city"]).first()
#         place_spec = session.query(Place_Spec).filter_by(latitude=row["latitude"], longitude=row["longitude"]).first()
#         attack_spec = session.query(Attack_Spec).filter_by(crit1=row["crit1"], crit2=row["crit2"], crit3=row["crit3"]).first()
#         # Аналогично получаем остальные связи...

#         # Создаем новый инцидент
#         incident = Incident(
#             iyear=row["iyear"],
#             imonth=row["imonth"],
#             iday=row["iday"],
#             related=row["related"],
#             city_id=city.id if city else None,  # Проверяем, есть ли город
#             place_spec_id=place_spec.id if place_spec else None,
#             attack_spec_id=attack_spec.id if attack_spec else None,
#             # Заполняем остальные поля...
#         )
#         session.add(incident)

#     session.commit()  # Сохраняем все инциденты сразу
