# Ukraine Conflict Text Analysis with Google Drive Integration

This notebook processes the Ukraine conflict data, creates TF-IDF matrices and PCA results, and uploads them to Google Drive.

## Install Required Packages

In [7]:
# Install required packages
!pip install pandas numpy nltk scikit-learn google-auth google-auth-oauthlib google-auth-httplib2 google-api-python-client

Collecting google-auth
  Using cached google_auth-2.38.0-py2.py3-none-any.whl.metadata (4.8 kB)
Collecting google-auth-oauthlib
  Downloading google_auth_oauthlib-1.2.1-py2.py3-none-any.whl.metadata (2.7 kB)
Collecting google-auth-httplib2
  Using cached google_auth_httplib2-0.2.0-py2.py3-none-any.whl.metadata (2.2 kB)
Collecting google-api-python-client
  Using cached google_api_python_client-2.166.0-py2.py3-none-any.whl.metadata (6.6 kB)
Collecting rsa<5,>=3.1.4 (from google-auth)
  Using cached rsa-4.9-py3-none-any.whl.metadata (4.2 kB)
Collecting requests-oauthlib>=0.7.0 (from google-auth-oauthlib)
  Downloading requests_oauthlib-2.0.0-py2.py3-none-any.whl.metadata (11 kB)
Collecting httplib2>=0.19.0 (from google-auth-httplib2)
  Using cached httplib2-0.22.0-py3-none-any.whl.metadata (2.6 kB)
Collecting google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0,>=1.31.5 (from google-api-python-client)
  Using cached google_api_core-2.24.2-py3-none-any.whl.metadata (3.0 kB)
Collecting ur

## Import Libraries

In [8]:
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt_tab')
nltk.download('punkt')
nltk.download('stopwords')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.util import bigrams
import re
import datetime
import time
import os
import io

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt_tab to /Users/admin/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /Users/admin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/admin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/admin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/admin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
from google.oauth2 import service_account
from googleapiclient.discovery import build
from googleapiclient.http import MediaInMemoryUpload

ROOT_FOLDER_ID = "1kN_F4168tJQGDnHcC6G7-Lp_Y1Lq2t5A"  # Replace with your folder ID
SERVICE_ACCOUNT_FILE = '/home/ubuntu/jn/air-alarms-data-506614e0f8b8.json'  # Replace with your file path
SCOPES = ['https://www.googleapis.com/auth/drive']

drive_service = None
if os.path.exists(SERVICE_ACCOUNT_FILE):
    credentials = service_account.Credentials.from_service_account_file(
        SERVICE_ACCOUNT_FILE, scopes=SCOPES
    )
    drive_service = build('drive', 'v3', credentials=credentials)
    print("Google Drive service initialized successfully.")
else:
    print(f"Warning: Service account file not found at {SERVICE_ACCOUNT_FILE}")
    print("You'll need to update the file path or use Option 1 authentication instead.")

You'll need to update the file path or use Option 1 authentication instead.


## Helper Functions

In [10]:
def safe_create_or_get_folder(folder_name, parent_id=None, retries=3):
    for attempt in range(retries):
        try:
            return create_or_get_folder(folder_name, parent_id)
        except Exception as e:
            print(f"Attempt {attempt+1} failed: {e}")
            time.sleep(2)
    raise RuntimeError(f"Failed to create folder '{folder_name}' after {retries} attempts.")

def create_or_get_folder(folder_name, parent_id=None):
    query = f"name='{folder_name}' and mimeType='application/vnd.google-apps.folder'"
    if parent_id:
        query += f" and '{parent_id}' in parents"
    results = drive_service.files().list(q=query, fields="files(id, name)").execute()
    folders = results.get('files', [])
    if folders:
        return folders[0]['id']
    file_metadata = {
        'name': folder_name,
        'mimeType': 'application/vnd.google-apps.folder',
        'parents': [parent_id] if parent_id else []
    }
    folder = drive_service.files().create(body=file_metadata, fields='id').execute()
    return folder['id']

def upload_csv_to_drive(folder_id, file_name, df):
    content = df.to_csv(index=False)
    media = MediaInMemoryUpload(content.encode(), mimetype='text/csv')
    query = f"name='{file_name}' and '{folder_id}' in parents"
    existing_files = drive_service.files().list(q=query, fields="files(id)").execute().get('files', [])
    if existing_files:
        file_id = existing_files[0]['id']
        drive_service.files().update(fileId=file_id, media_body=media).execute()
        print(f"Updated existing file: {file_name}")
    else:
        file_metadata = {
            'name': file_name,
            'parents': [folder_id],
            'mimeType': 'text/csv'
        }
        drive_service.files().create(body=file_metadata, media_body=media).execute()
        print(f"Created new file: {file_name}")

Load and Prepare Data

In [14]:
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

custom_stop_phrases = [
    "angelica evan", "christina harward", "click read", "click see", "et click",
    "frederick kagan", "full report", "karolina hird", "isw assess", "isw continu",
    "isw cover", "isw interact", "isw observ", "isw previous", "kateryna stepanenko",
    "key takeaway", "map present", "map russian", "map updat", "nicol wolkov", "pm et",
    "present report", "previous assess", "previous report", "read full", "riley bailey",
    "see isw", "static map", "takeaway russian", "timelaps map"
]

stemmed_custom_phrases = [
    ' '.join([stemmer.stem(word) for word in phrase.split()])
    for phrase in custom_stop_phrases
]

df = pd.read_csv("ukraine_conflict_updates.csv")
df = df.dropna(subset=['content'])
print(df.head(10))


def remove_stemmed_phrases(tokens, phrase_list):
    text = ' '.join(tokens)
    for phrase in phrase_list:
        text = re.sub(rf"\b{re.escape(phrase)}\b", "", text)
    return text.split()

def clean_and_show_tokens(text):
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)
    text = re.sub(r'\@\w+|\#','', text)
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    text = text.lower()
    tokens = word_tokenize(text)

clean_and_show_tokens(df['content'].iloc[0])

def clean_and_stem(text):
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)
    text = re.sub(r'\@\w+|\#','', text)
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    text = text.lower()
    
    tokens = word_tokenize(text)
    tokens = [t for t in tokens if t not in stop_words]
    stemmed = [stemmer.stem(token) for token in tokens]
    cleaned_tokens = remove_stemmed_phrases(stemmed, stemmed_custom_phrases)
    
    return cleaned_tokens

def bigram_tokenizer(text):
    tokens = clean_and_stem(text)
    return [' '.join(bg) for bg in bigrams(tokens)]

         date                                            content
0  05-04-2025  Click here to read the full report with maps O...
1  04-04-2025  Click here to read the full report with maps K...
2  03-04-2025  Click here to read the full report. Nicole Wol...
3  02-04-2025  Click here to read the full report Angelica Ev...
4  01-04-2025  Click here to read the full report. Angelica E...
5  31-03-2025  Click here to read the full report. Nicole Wol...
6  30-03-2025  Click here to read the full report with maps O...
7  29-03-2025  Click here to read the full report with maps N...
8  28-03-2025  Click here to read the full report with maps A...
9  27-03-2025  Click here to read the full report. Nicole Wol...


## Text Processing Functions

In [15]:
print(df.head())

         date                                            content
0  05-04-2025  Click here to read the full report with maps O...
1  04-04-2025  Click here to read the full report with maps K...
2  03-04-2025  Click here to read the full report. Nicole Wol...
3  02-04-2025  Click here to read the full report Angelica Ev...
4  01-04-2025  Click here to read the full report. Angelica E...


## Create TF-IDF Matrix

In [16]:
vectorizer = TfidfVectorizer(tokenizer=bigram_tokenizer, max_features=200)
X = vectorizer.fit_transform(df['content'])

feature_names = vectorizer.get_feature_names_out()
tfidf_matrix = pd.DataFrame(X.toarray(), columns=feature_names)
tfidf_matrix.insert(0, 'date', df['date'].values)
tfidf_matrix.to_csv("bigram_tfidf_matrix.csv", index=False)

print(f"Created TF–IDF matrix with {tfidf_matrix.shape[0]} rows and {tfidf_matrix.shape[1]} columns")
print(X.shape)



Created TF–IDF matrix with 1125 rows and 201 columns
(1125, 200)


## Perform PCA

In [17]:
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X.toarray())

pca_df = pd.DataFrame(X_pca, columns=['PC1', 'PC2'])
pca_df.insert(0, 'date', df['date'].values)  
pca_df.to_csv("pca_tfidf_result.csv", index=False)

print(f"Created PCA result with {pca_df.shape[0]} rows and {pca_df.shape[1]} columns")


Created PCA result with 1125 rows and 3 columns


## Upload Files to Google Drive

In [18]:
def upload_files_to_drive_service_account():
    if drive_service is None:
        print("Google Drive service not initialized. Check your service account credentials.")
        return

    try:
        results_folder_id = safe_create_or_get_folder("isw_vectors", parent_id=ROOT_FOLDER_ID)

        upload_csv_to_drive(results_folder_id, "bigram_tfidf_matrix.csv", tfidf_matrix)
        upload_csv_to_drive(results_folder_id, "pca_tfidf_result.csv", pca_df)

        print(f"All files uploaded successfully to folder: isw_vectors")

    except Exception as e:
        print(f"Error uploading files: {e}")

In [19]:
def upload_files_to_drive_colab():
    try:
        
        folder_path = f"/content/drive/MyDrive/isw_vectors"
        
        os.makedirs(folder_path, exist_ok=True)
        
        tfidf_matrix.to_csv(f"{folder_path}/bigram_tfidf_matrix.csv", index=False)
        pca_df.to_csv(f"{folder_path}/pca_tfidf_result.csv", index=False)
        
        print(f"All files uploaded successfully to Google Drive at: {folder_path}")
    except Exception as e:
        print(f"Error uploading files: {e}")

## Execute Upload to Google Drive

Choose and run one of the following cells depending on your authentication method:

In [11]:
upload_files_to_drive_service_account()

Error uploading files: name 'tfidf_matrix' is not defined
