In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


import nltk
from nltk.corpus import stopwords
import re


In [2]:
train_df = pd.read_csv('ans1.csv')
test_df = pd.read_csv('news_data1.csv')


In [3]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

X_train = train_df['Title'].values
y_train = train_df['label'].values
X_test = test_df['Title'].values

# Tokenization and Padding
max_words = 10000  # Max vocabulary size
max_len = 20       # Max sequence length

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

# CNN Model
model = Sequential([
    Embedding(max_words, 128, input_length=max_len),
    Conv1D(128, 5, activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=2),
    Conv1D(64, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(10, activation='relu'),
    Dense(1, activation='sigmoid')  # Binary classification
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train_pad, y_train, epochs=5, batch_size=32, validation_split=0.2)
predictions = model.predict(X_test_pad)
predicted_labels = (predictions > 0.5).astype(int) 

Epoch 1/5




[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step - accuracy: 0.8850 - loss: 0.6113 - val_accuracy: 1.0000 - val_loss: 0.0642
Epoch 2/5
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.9835 - loss: 0.1748 - val_accuracy: 1.0000 - val_loss: 0.0020
Epoch 3/5
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.9899 - loss: 0.0775 - val_accuracy: 1.0000 - val_loss: 0.0044
Epoch 4/5
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.9651 - loss: 0.1709 - val_accuracy: 1.0000 - val_loss: 0.0320
Epoch 5/5
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.9855 - loss: 0.0934 - val_accuracy: 1.0000 - val_loss: 0.0121
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step


In [4]:
import pandas as pd
from nltk.corpus import stopwords
import nltk
import pickle
# Ensure stopwords are downloaded
nltk.download('stopwords')

word_index = tokenizer.word_index

stop_words = set(stopwords.words('english'))

filtered_word_index = {word: index for word, index in word_index.items() if word not in stop_words}

word_freq_df = pd.DataFrame(list(filtered_word_index.items()), columns=['Word', 'Index'])

most_common_words = word_freq_df.sort_values(by='Index').head(20)  # Fetch top 20 common words

print(most_common_words)


         Word  Index
0       check      6
1       video      8
2       noida     10
3       delhi     12
4    gurugram     13
5   bengaluru     14
6     chennai     15
7   ahmedabad     16
8          rs     17
9     details     18
10     mumbai     20
11  hyderabad     21
12       pune     22
13  ghaziabad     23
14    traffic     26
15          5     27
16    kolkata     31
17         pm     32
18          1     33
19    doctors     34


[nltk_data] Downloading package stopwords to C:\Users\Priyanshu
[nltk_data]     Upadhyay\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Vectorize the text (Bag of Words)
vectorizer = CountVectorizer(max_features=5000)
X_train_vec_split = vectorizer.fit_transform(X_train_split)
X_val_vec_split = vectorizer.transform(X_val_split)

# Logistic Regression Model
lr_model = LogisticRegression()
lr_model.fit(X_train_vec_split, y_train_split)

lr_val_predictions = lr_model.predict(X_val_vec_split)

accuracy = accuracy_score(y_val_split, lr_val_predictions)
print(f"Logistic Regression Accuracy: {accuracy * 100:.2f}%")


Logistic Regression Accuracy: 98.73%


In [6]:
from sklearn.svm import SVC

# SVM Model
svm_model = SVC(kernel='linear')
svm_model.fit(X_train_vec_split, y_train_split)

svm_val_predictions = svm_model.predict(X_val_vec_split)

accuracy = accuracy_score(y_val_split, svm_val_predictions)
print(f"SVM Accuracy: {accuracy * 100:.2f}%")


SVM Accuracy: 98.73%


In [7]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest Model
rf_model = RandomForestClassifier(n_estimators=100)
rf_model.fit(X_train_vec_split, y_train_split)

rf_val_predictions = rf_model.predict(X_val_vec_split)

accuracy = accuracy_score(y_val_split, rf_val_predictions)
print(f"Random Forest Accuracy: {accuracy * 100:.2f}%")


Random Forest Accuracy: 98.73%


In [8]:
pickle.dump(rf_model,open("model.pkl","wb"))

In [9]:


# Remove duplicates
test_df = test_df.drop_duplicates()

# Filter rows where the predicted label is 1
# = test_df[test_df['label'] == 1]
test_df = pd.read_csv('ans1.csv')
test_df_filtered = test_df[test_df['label'] == 1]
test_df_filtered.to_csv('ans2.csv', index=False)

In [10]:
test_df_filtered

Unnamed: 0,Title,Links,label
37,Uttar Pradesh: Building Collapses In Meerut's ...,https://www.timesnownews.com/india/uttar-prade...,1
80,Will Pacific Cyclone Yagi Bring More Rainfall...,https://www.timesnownews.com/delhi/will-cyclon...,1
147,IAF Helicopter Makes Emergency Landing On Padd...,https://www.timesnownews.com/chennai/iaf-helic...,1
153,"In Chennai Schools, Maha Vishnu Talked About R...",https://www.timesnownews.com/chennai/in-chenna...,1
223,Ahmedabad On Yellow Alert For Thunderstorm Ami...,https://www.timesnownews.com/city/ahmedabad/ah...,1
231,Ahmedabad Braces for Light Rain and Cloudy Ski...,https://www.timesnownews.com/city/ahmedabad/ah...,1
251,Video: Major Landslide Hits Jammu-Rajouri-Poon...,https://www.timesnownews.com/india/video-major...,1
275,Will Pacific Cyclone Yagi Bring More Rainfall ...,https://www.timesnownews.com/delhi/will-cyclon...,1


In [11]:
import pymongo
from pymongo import MongoClient

# Connect to MongoDB
client = MongoClient('mongodb+srv://Priyanshu23u:24681012@cluster0.fyvfy.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0')  # Adjust connection string as needed

# Access the database and collection
db = client['sih1']
collection = db['news1']

# Remove previous records from the collection
delete_result = collection.delete_many({})  # Deletes all documents in the collection
print(f"Deleted {delete_result.deleted_count} records from 'news1' collection.")

# Assuming 'test_df_filtered' is your DataFrame that you want to insert
data_dict = test_df_filtered.to_dict("records")  # Convert DataFrame to list of dictionaries

# Insert the new data into the collection
insert_doc = collection.insert_many(data_dict)

# Print inserted IDs (list of document IDs)
print(f"Inserted {len(insert_doc.inserted_ids)} new records.")
print(f"Inserted IDs: {insert_doc.inserted_ids}")

# Close the connection
client.close()


Deleted 8 records from 'news1' collection.
Inserted 8 new records.
Inserted IDs: [ObjectId('66f1c6cf3cc256c9ecd31b55'), ObjectId('66f1c6cf3cc256c9ecd31b56'), ObjectId('66f1c6cf3cc256c9ecd31b57'), ObjectId('66f1c6cf3cc256c9ecd31b58'), ObjectId('66f1c6cf3cc256c9ecd31b59'), ObjectId('66f1c6cf3cc256c9ecd31b5a'), ObjectId('66f1c6cf3cc256c9ecd31b5b'), ObjectId('66f1c6cf3cc256c9ecd31b5c')]


In [12]:
from pymongo import MongoClient
import requests
from bs4 import BeautifulSoup
import pytesseract
from PIL import Image
from io import BytesIO
import pandas as pd

# Connect to MongoDB
client = MongoClient('mongodb+srv://Priyanshu23u:24681012@cluster0.fyvfy.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0')
db = client['sih1']
collection = db['tweets']

# Extract data from MongoDB
tweets_data = list(collection.find())

# Convert the extracted data into a DataFrame
df = pd.DataFrame(tweets_data)

# Combine 'createdAt' and 'fullText' into a new column 'content'
df['content'] = df['createdAt'].astype(str) + ': ' + df['fullText']

# Drop the original 'createdAt' and 'fullText' columns
df.drop(columns=['createdAt', 'fullText'], inplace=True)

# Function to extract text from a webpage
def extract_text_from_url(url):
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup.get_text().strip()
    except Exception as e:
        print(f"Error extracting text from URL {url}: {e}")
        return ""

# Function to extract text from an image using OCR
def extract_image_text(image_url):
    try:
        response = requests.get(image_url)
        img = Image.open(BytesIO(response.content))
        text = pytesseract.image_to_string(img)
        return text.strip()
    except Exception as e:
        print(f"Error extracting text from image URL {image_url}: {e}")
        return ""

# Function to check and extract content from image or video URLs
def process_media_url(url):
    # Check for image content
    if any(ext in url for ext in ['.jpg', '.jpeg', '.png']):
        return extract_image_text(url)
    
    # Placeholder for video content extraction
    if any(ext in url for ext in ['.mp4', '.mov', 'video']):
        return "Extracted video content (placeholder)"
    
    return ""

# Extract content from URLs if present and merge into 'content'
for index, row in df.iterrows():
    url = row.get('url', '')
    if url:
        text_content = extract_text_from_url(url)
        media_content = process_media_url(url)
        
        # Merge extracted web and media content into 'content' column
        df.at[index, 'content'] += f"\nExtracted Web Content: {text_content}"
        if media_content:
            df.at[index, 'content'] += f"\nExtracted Media Content: {media_content}"

# Display the final DataFrame
print(df)



print("Data has been successfully processed and inserted into the 'tweetcontents' collection.")


                         _id                   id lang  \
0   66e734987eabf84a8810e4dd  1835379492665405623   en   
1   66e734987eabf84a8810e4de  1835367153773531441   en   
2   66e734987eabf84a8810e4df  1835359759802724555   ml   
3   66e734987eabf84a8810e4e0  1835341344094482649   hi   
4   66e734987eabf84a8810e4e1  1835335175862985002   en   
5   66e734987eabf84a8810e4e2  1835333325109526970   en   
6   66e734987eabf84a8810e4e3  1835323661504413727   en   
7   66e734987eabf84a8810e4e4  1835322958774177929   en   
8   66e734987eabf84a8810e4e5  1835299259346010590   hu   
9   66e734987eabf84a8810e4e6  1835288586729509083   ar   
10  66e734987eabf84a8810e4e7  1835284032688378342   ta   
11  66e734987eabf84a8810e4e8  1835275413892825154   hi   
12  66e734987eabf84a8810e4e9  1835274680413860016   en   
13  66e734987eabf84a8810e4ea  1835273533389201563   en   
14  66e734987eabf84a8810e4eb  1835267558674465245   en   
15  66e734987eabf84a8810e4ec  1835234417188421737   tl   
16  66e734987e

In [13]:
# Import necessary libraries
from pymongo import MongoClient
import pandas as pd

# Connect to MongoDB



In [14]:
import re
import spacy
import datefinder
import dateparser
import geopy
from geopy.geocoders import Nominatim
# Load spaCy's English model
nlp = spacy.load('en_core_web_sm')

non_indian_countries = [
    'Vietnam', 'Myanmar', 'USA', 'China', 'Japan', 'UK', 'Germany', 'France',
    'Brazil', 'Canada', 'Russia', 'Australia', 'Pakistan', 'Sri Lanka', 'Nepal', 
    'Bangladesh', 'Thailand', 'Singapore', 'Indonesia', 'Mexico', 'South Korea'
]

def extract_location(text):
    """Extract the first two locations (GPE) using spaCy's NER and filter out country names."""
    doc = nlp(text)
    locations = []
    for ent in doc.ents:
        if ent.label_ == 'GPE' and ent.text not in non_indian_countries:
            locations.append(ent.text)
    
    # Return only the first two locations if available
    return ', '.join(locations[:2]) if locations else None
def extract_first_date(text):
    """Extract the first date from the given text using regular expressions and dateparser."""
    # Regular expression pattern to match common date formats
    date_pattern = r'\b(?:[A-Z][a-z]+ \d{1,2}(?:, \d{4})?|(?:\d{4}-\d{2}-\d{2}))\b'

    
    # Find all matches
    matches = re.findall(date_pattern, text)
    
    if matches:
        # Use dateparser to parse the first match
        parsed_date = dateparser.parse(matches[0], languages=['en'])
        if parsed_date:
            return parsed_date.strftime('%Y-%m-%d')
    
    return None
def extract_date_time(text):
    """Extract date and time using the extract_first_date function."""
    return extract_first_date(text)

def extract_disaster_type(text, disaster_keywords):
    """Extract disaster type based on a predefined list."""
    disaster = [disaster for disaster in disaster_keywords if disaster in text.lower()]
    return disaster[0] if disaster else "Unknown"

def extract_short_description(text):
    """Extract a short description from the text, limited to two lines."""
    # Split text into sentences and keep only the first two
    sentences = re.split(r'(?<=[.!?])\s+', text.strip())
    short_desc = ' '.join(sentences[:2])  # Join the first two sentences
    return short_desc

def process_df(df):
    """Apply extraction functions to the DataFrame."""
    disaster_keywords = [
        'earthquake', 'flood', 'hurricane', 'tornado', 'tsunami', 
        'volcano', 'cyclone', 'wildfire', 'landslide', 'avalanche',
        'drought', 'heatwave', 'blizzard', 'storm', 'typhoon', 
        'hailstorm', 'mudslide', 'sandstorm', 'tremor', 'aftershock'
    ]  # Example disaster types

    df['location'] = df['content'].apply(extract_location)  # Extract first two locations
    df['date'] = df['content'].apply(extract_date_time)  # Extract date
    df['disaster_type'] = df['content'].apply(lambda x: extract_disaster_type(x, disaster_keywords))  # Extract disaster type
    df['short_description'] = df['content'].apply(extract_short_description)  # Extract short description (2 lines)
    
    # Drop the 'date_time' column if it exists
    if 'date_time' in df.columns:
        df = df.drop(columns=['date_time'])
    
    return df




In [15]:
processed_df = process_df(df)
print(processed_df)


                         _id                   id lang  \
0   66e734987eabf84a8810e4dd  1835379492665405623   en   
1   66e734987eabf84a8810e4de  1835367153773531441   en   
2   66e734987eabf84a8810e4df  1835359759802724555   ml   
3   66e734987eabf84a8810e4e0  1835341344094482649   hi   
4   66e734987eabf84a8810e4e1  1835335175862985002   en   
5   66e734987eabf84a8810e4e2  1835333325109526970   en   
6   66e734987eabf84a8810e4e3  1835323661504413727   en   
7   66e734987eabf84a8810e4e4  1835322958774177929   en   
8   66e734987eabf84a8810e4e5  1835299259346010590   hu   
9   66e734987eabf84a8810e4e6  1835288586729509083   ar   
10  66e734987eabf84a8810e4e7  1835284032688378342   ta   
11  66e734987eabf84a8810e4e8  1835275413892825154   hi   
12  66e734987eabf84a8810e4e9  1835274680413860016   en   
13  66e734987eabf84a8810e4ea  1835273533389201563   en   
14  66e734987eabf84a8810e4eb  1835267558674465245   en   
15  66e734987eabf84a8810e4ec  1835234417188421737   tl   
16  66e734987e

In [16]:
import pymongo
from pymongo import MongoClient

# Connect to MongoDB
client = MongoClient('mongodb+srv://Priyanshu23u:24681012@cluster0.fyvfy.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0')

# Access the database and collection
db = client['sih1']  # Use your database name
collection = db['final_tweet']  # Use your collection name

# Remove previous records from the collection
delete_result = collection.delete_many({})  # Deletes all documents in the collection
print(f"Deleted {delete_result.deleted_count} records from 'final_tweet' collection.")

# Data to insert (Example)
data_dict = processed_df.to_dict("records")  # Assuming df_processed is the DataFrame

# Insert the new data into the collection
insert_doc = collection.insert_many(data_dict)

# Print inserted IDs (list of document IDs)
print(f"Inserted {len(insert_doc.inserted_ids)} new records.")
print(f"Inserted IDs: {insert_doc.inserted_ids}")

# Close the MongoDB connection
client.close()


Deleted 35 records from 'final_tweet' collection.
Inserted 35 new records.
Inserted IDs: [ObjectId('66e734987eabf84a8810e4dd'), ObjectId('66e734987eabf84a8810e4de'), ObjectId('66e734987eabf84a8810e4df'), ObjectId('66e734987eabf84a8810e4e0'), ObjectId('66e734987eabf84a8810e4e1'), ObjectId('66e734987eabf84a8810e4e2'), ObjectId('66e734987eabf84a8810e4e3'), ObjectId('66e734987eabf84a8810e4e4'), ObjectId('66e734987eabf84a8810e4e5'), ObjectId('66e734987eabf84a8810e4e6'), ObjectId('66e734987eabf84a8810e4e7'), ObjectId('66e734987eabf84a8810e4e8'), ObjectId('66e734987eabf84a8810e4e9'), ObjectId('66e734987eabf84a8810e4ea'), ObjectId('66e734987eabf84a8810e4eb'), ObjectId('66e734987eabf84a8810e4ec'), ObjectId('66e734987eabf84a8810e4ed'), ObjectId('66e734987eabf84a8810e4ee'), ObjectId('66e734987eabf84a8810e4ef'), ObjectId('66e9ddd352f651c81275b7dd'), ObjectId('66e9ddd352f651c81275b7de'), ObjectId('66e9ddd352f651c81275b7df'), ObjectId('66e9ddd352f651c81275b7e0'), ObjectId('66e9ddd352f651c81275b7e1')

In [17]:
from pymongo import MongoClient

# Connect to MongoDB
client = MongoClient('mongodb+srv://Priyanshu23u:24681012@cluster0.fyvfy.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0')
db = client['sih1']
collection = db['newscontents']

news_data = list(collection.find())

# Convert the extracted data into a DataFrame
df1 = pd.DataFrame(news_data)


# Display the DataFrame
print(df1)
print("Data has been successfully inserted into the 'newscontents' collection.")


                         _id  \
0   66e5d05e690d95503b693fa8   
1   66e5d05e690d95503b693faa   
2   66e5d05f690d95503b693fae   
3   66e5d05f690d95503b693fb0   
4   66e5d05f690d95503b693fb2   
5   66e5d05f690d95503b693fb4   
6   66e5d05f690d95503b693fb6   
7   66e9df92826af9195115379e   
8   66e9df93826af919511537a0   
9   66e9df93826af919511537a6   
10  66e9df94826af919511537a8   
11  66e9df94826af919511537aa   
12  66e9df94826af919511537ac   

                                                 link  \
0   https://www.timesnownews.com/india/uttar-prade...   
1   https://www.timesnownews.com/delhi/will-cyclon...   
2   https://www.timesnownews.com/chennai/in-chenna...   
3   https://www.timesnownews.com/city/ahmedabad/ah...   
4   https://www.timesnownews.com/city/ahmedabad/ah...   
5   https://www.timesnownews.com/india/video-major...   
6   https://www.timesnownews.com/delhi/will-cyclon...   
7   https://www.timesnownews.com/india/uttar-prade...   
8   https://www.timesnownews.com/delhi

In [18]:
processed_df1 = process_df(df1)
print(processed_df1)

                         _id  \
0   66e5d05e690d95503b693fa8   
1   66e5d05e690d95503b693faa   
2   66e5d05f690d95503b693fae   
3   66e5d05f690d95503b693fb0   
4   66e5d05f690d95503b693fb2   
5   66e5d05f690d95503b693fb4   
6   66e5d05f690d95503b693fb6   
7   66e9df92826af9195115379e   
8   66e9df93826af919511537a0   
9   66e9df93826af919511537a6   
10  66e9df94826af919511537a8   
11  66e9df94826af919511537aa   
12  66e9df94826af919511537ac   

                                                 link  \
0   https://www.timesnownews.com/india/uttar-prade...   
1   https://www.timesnownews.com/delhi/will-cyclon...   
2   https://www.timesnownews.com/chennai/in-chenna...   
3   https://www.timesnownews.com/city/ahmedabad/ah...   
4   https://www.timesnownews.com/city/ahmedabad/ah...   
5   https://www.timesnownews.com/india/video-major...   
6   https://www.timesnownews.com/delhi/will-cyclon...   
7   https://www.timesnownews.com/india/uttar-prade...   
8   https://www.timesnownews.com/delhi

In [19]:
import pymongo
from pymongo import MongoClient

# Connect to MongoDB
client = MongoClient('mongodb+srv://Priyanshu23u:24681012@cluster0.fyvfy.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0')  # Adjust connection string as needed

# Access the database and collection
db = client['sih1']  # Use your database name
collection = db['final_news']  # Use your collection name

# Remove previous records from the collection
delete_result = collection.delete_many({})  # Deletes all documents in the collection
print(f"Deleted {delete_result.deleted_count} records from 'final_news' collection.")
# Data to insert (Example)
data_dict =processed_df1.to_dict("records")

# Insert the new data into the collection
insert_doc = collection.insert_many(data_dict)

# Print inserted IDs (list of document IDs)
print(f"Inserted IDs: {insert_doc.inserted_ids}")

# Close the MongoDB connection
client.close()


Deleted 13 records from 'final_news' collection.
Inserted IDs: [ObjectId('66e5d05e690d95503b693fa8'), ObjectId('66e5d05e690d95503b693faa'), ObjectId('66e5d05f690d95503b693fae'), ObjectId('66e5d05f690d95503b693fb0'), ObjectId('66e5d05f690d95503b693fb2'), ObjectId('66e5d05f690d95503b693fb4'), ObjectId('66e5d05f690d95503b693fb6'), ObjectId('66e9df92826af9195115379e'), ObjectId('66e9df93826af919511537a0'), ObjectId('66e9df93826af919511537a6'), ObjectId('66e9df94826af919511537a8'), ObjectId('66e9df94826af919511537aa'), ObjectId('66e9df94826af919511537ac')]


In [20]:
processed_df1.to_csv('processed_data.csv', index=False, date_format='%Y-%m-%d')


In [21]:
df1

Unnamed: 0,_id,link,content,__v,location,date,disaster_type,short_description
0,66e5d05e690d95503b693fa8,https://www.timesnownews.com/india/uttar-prade...,Meerut: A building collapsed in Zakir Colony o...,0,"Zakir, Meerut",,Unknown,Meerut: A building collapsed in Zakir Colony o...
1,66e5d05e690d95503b693faa,https://www.timesnownews.com/delhi/will-cyclon...,"New Delhi: The remnants of Cyclone Yagi, origi...",0,"New Delhi, West Bengal",2024-09-16,cyclone,"New Delhi: The remnants of Cyclone Yagi, origi..."
2,66e5d05f690d95503b693fae,https://www.timesnownews.com/chennai/in-chenna...,"Chennai: Rebirth, firestorms, curing diseases ...",0,"Poyyamozhi, Saidapet",2024-08-28,storm,"Chennai: Rebirth, firestorms, curing diseases ..."
3,66e5d05f690d95503b693fb0,https://www.timesnownews.com/city/ahmedabad/ah...,Ahmedabad: Gujarat has been experiencing heavy...,0,"Gujarat, Ahmedabad",2024-09-10,storm,Ahmedabad: Gujarat has been experiencing heavy...
4,66e5d05f690d95503b693fb2,https://www.timesnownews.com/city/ahmedabad/ah...,Ahmedabad: A day after the rain fury in Gujara...,0,"Gujarat, Ahmedabad",2024-08-30,cyclone,Ahmedabad: A day after the rain fury in Gujara...
5,66e5d05f690d95503b693fb4,https://www.timesnownews.com/india/video-major...,Srinagar: A massive landslide hit the Jammu-Ra...,0,"Kinnaur, Shimla",,flood,Srinagar: A massive landslide hit the Jammu-Ra...
6,66e5d05f690d95503b693fb6,https://www.timesnownews.com/delhi/will-cyclon...,"New Delhi: The remnants of Cyclone Yagi, origi...",0,"New Delhi, West Bengal",2024-09-16,cyclone,"New Delhi: The remnants of Cyclone Yagi, origi..."
7,66e9df92826af9195115379e,https://www.timesnownews.com/india/uttar-prade...,"Meerut: Amid the heavy rainfall, a building co...",0,"Zakir, Meerut",,flood,"Meerut: Amid the heavy rainfall, a building co..."
8,66e9df93826af919511537a0,https://www.timesnownews.com/delhi/will-cyclon...,"New Delhi: The remnants of Cyclone Yagi, origi...",0,"New Delhi, West Bengal",2024-09-16,cyclone,"New Delhi: The remnants of Cyclone Yagi, origi..."
9,66e9df93826af919511537a6,https://www.timesnownews.com/city/ahmedabad/ah...,Ahmedabad: Gujarat has been experiencing heavy...,0,"Gujarat, Ahmedabad",2024-09-10,storm,Ahmedabad: Gujarat has been experiencing heavy...


In [22]:
processed_df

Unnamed: 0,_id,id,lang,url,__v,content,location,date,disaster_type,short_description
0,66e734987eabf84a8810e4dd,1835379492665405623,en,https://x.com/user/status/1835379492665405623,0,"2024-09-15 18:05:45: ""#India has sent 35 tons ...","India, India",2024-09-15,typhoon,"2024-09-15 18:05:45: ""#India has sent 35 tons ..."
1,66e734987eabf84a8810e4de,1835367153773531441,en,https://x.com/user/status/1835367153773531441,0,2024-09-15 17:16:43: Yaha itni Baarish ho rahe...,"Island, Monsoon https://t.co/DdJ23GFGkg",2024-09-15,cyclone,2024-09-15 17:16:43: Yaha itni Baarish ho rahe...
2,66e734987eabf84a8810e4df,1835359759802724555,ml,https://x.com/user/status/1835359759802724555,0,2024-09-15 16:47:20: യാഗി ചുഴലിക്കാറ്റ് നാശം വ...,JanamTv,2024-09-15,cyclone,2024-09-15 16:47:20: യാഗി ചുഴലിക്കാറ്റ് നാശം വ...
3,66e734987eabf84a8810e4e0,1835341344094482649,hi,https://x.com/user/status/1835341344094482649,0,2024-09-15 15:34:10: भारत ने बढ़ाया मदद का हाथ...,myanmar,2024-09-15,flood,2024-09-15 15:34:10: भारत ने बढ़ाया मदद का हाथ...
4,66e734987eabf84a8810e4e1,1835335175862985002,en,https://x.com/user/status/1835335175862985002,0,2024-09-15 15:09:39: #India has dispatched $1 ...,"India, India",2024-09-15,typhoon,2024-09-15 15:09:39: #India has dispatched $1 ...
5,66e734987eabf84a8810e4e2,1835333325109526970,en,https://x.com/user/status/1835333325109526970,0,2024-09-15 15:02:18: #2024 saw record breaking...,India,2024-09-15,heatwave,2024-09-15 15:02:18: #2024 saw record breaking...
6,66e734987eabf84a8810e4e3,1835323661504413727,en,https://x.com/user/status/1835323661504413727,0,2024-09-15 14:23:54: #APMetTWO 24D259PM\nGloba...,"Typhoon, India",2024-09-15,typhoon,2024-09-15 14:23:54: #APMetTWO 24D259PM\nGloba...
7,66e734987eabf84a8810e4e4,1835322958774177929,en,https://x.com/user/status/1835322958774177929,0,2024-09-15 14:21:06: India on Sunday sent urge...,"India, Laos",2024-09-15,typhoon,2024-09-15 14:21:06: India on Sunday sent urge...
8,66e734987eabf84a8810e4e5,1835299259346010590,hu,https://x.com/user/status/1835299259346010590,0,2024-09-15 12:46:56: Súlyos #weather események...,"Philippines, California",2024-09-15,flood,2024-09-15 12:46:56: Súlyos #weather események...
9,66e734987eabf84a8810e4e6,1835288586729509083,ar,https://x.com/user/status/1835288586729509083,0,2024-09-15 12:04:31: انهيار جبلى رهيب فى اوتار...,India,2024-09-15,landslide,2024-09-15 12:04:31: انهيار جبلى رهيب فى اوتار...


In [23]:
import re
import spacy
import datefinder
import dateparser

# Load spaCy's English model
nlp = spacy.load('en_core_web_sm')

import spacy
import geopy
from geopy.geocoders import Nominatim

# Load spaCy's English model
nlp = spacy.load('en_core_web_sm')

# Initialize Nominatim geolocator
geolocator = Nominatim(user_agent="geoapiExercises")

def is_indian_location(location):
    """Check if a location is in India using geopy."""
    try:
        # Use geopy to get the country of the location
        location_info = geolocator.geocode(location, language='en')
        if location_info and 'India' in location_info.address:
            return True
    except Exception as e:
        print(f"Error checking location {location}: {e}")
    return False

def extract_location(text):
    """Extract the first two locations (GPE) using spaCy's NER and keep only city or state names in India."""
    doc = nlp(text)
    locations = [ent.text for ent in doc.ents if ent.label_ == 'GPE']  # Extract location entities
    
    # Filter locations to remove non-Indian countries
    valid_locations = []
    for loc in locations:
        if is_indian_location(loc):  # Check if location is in India
            valid_locations.append(loc)  # Add only Indian locations (city/state)
    
    # Return the first two valid locations
    return ', '.join(valid_locations[:2]) if valid_locations else None

def extract_first_date(text):
    """Extract the first date from the given text using regular expressions and dateparser."""
    # Regular expression pattern to match common date formats
    date_pattern = r'\b(?:[A-Z][a-z]+ \d{1,2}(?:, \d{4})?)\b'
    
    # Find all matches
    matches = re.findall(date_pattern, text)
    
    if matches:
        # Use dateparser to parse the first match
        parsed_date = dateparser.parse(matches[0], languages=['en'])
        if parsed_date:
            return parsed_date.strftime('%Y-%m-%d')
    
    return None
def extract_date_time(text):
    """Extract date and time using the extract_first_date function."""
    return extract_first_date(text)

def extract_disaster_type(text, disaster_keywords):
    """Extract disaster type based on a predefined list."""
    disaster = [disaster for disaster in disaster_keywords if disaster in text.lower()]
    return disaster[0] if disaster else "Unknown"

def extract_short_description(text):
    """Extract a short description from the text, limited to two lines."""
    # Split text into sentences and keep only the first two
    sentences = re.split(r'(?<=[.!?])\s+', text.strip())
    short_desc = ' '.join(sentences[:2])  # Join the first two sentences
    return short_desc

def process_df(df):
    """Apply extraction functions to the DataFrame."""
    disaster_keywords = [
        'earthquake', 'flood', 'hurricane', 'tornado', 'tsunami', 
        'volcano', 'cyclone', 'wildfire', 'landslide', 'avalanche',
        'drought', 'heatwave', 'blizzard', 'storm', 'typhoon', 
        'hailstorm', 'mudslide', 'sandstorm', 'tremor', 'aftershock'
    ]  # Example disaster types

    df['location'] = df['content'].apply(extract_location)  # Extract first two locations
    df['date'] = df['content'].apply(extract_date_time)  # Extract date
    df['disaster_type'] = df['content'].apply(lambda x: extract_disaster_type(x, disaster_keywords))  # Extract disaster type
    df['short_description'] = df['content'].apply(extract_short_description)  # Extract short description (2 lines)
    
    # Drop the 'date_time' column if it exists
    if 'date_time' in df.columns:
        df = df.drop(columns=['date_time'])
    
    return df


