# Reading the JSON file, extracting the relevant tweets/info, then inserting them into the MongoDB collections

In [None]:
# Importing necessary libraries
import numpy as np
import pandas as pd
import json
import pymongo #MongoDB
from datetime import datetime

In [None]:
# --- MongoDB connection ---
# A single client against the local MongoDB server. The three collection
# handles below are module-level names used by the functions that follow.
mongo_client = pymongo.MongoClient("mongodb://localhost:27017")
mongo_db = mongo_client.get_database("Twitter_Database1")
tweet_collection = mongo_db.get_collection("Tweets1")
quoted_tweet_collection = mongo_db.get_collection("Quoted-Tweets1")
retweet_collection = mongo_db.get_collection("Retweets1")

In [None]:
def insert_tweet(tweet_info, collec):
    """Store one tweet document in ``collec``.

    Any exception raised by ``collec.insert_one`` is caught and reported on
    stdout instead of being propagated, so the function always returns None.
    """
    try:
        collec.insert_one(tweet_info)
    except Exception as insert_error:
        failure_message = f"Error inserting tweet: {insert_error}"
        print(failure_message)

def find_tweet1(tweet_id, collec):
    """Report whether ``collec`` holds a document whose Tweet_Id equals ``tweet_id``.

    Returns True when ``collec.find_one`` yields anything other than None,
    False otherwise.
    """
    match = collec.find_one({"Tweet_Id": tweet_id})
    if match is None:
        return False
    return True

def find_tweet(tweet_id):
    """
    Report whether ``tweet_id`` is stored in any of the three tweet collections.

    The collections are searched in the order tweets, retweets, quoted tweets,
    and the search stops at the first one that returns a (truthy) document.
    """
    query = {"Tweet_Id": tweet_id}
    searched = (tweet_collection, retweet_collection, quoted_tweet_collection)
    return any(coll.find_one(query) for coll in searched)

def update_tweet(data, collection):
    """
    Raise a stored tweet's engagement counters to the newly observed values.

    ``data`` is a raw tweet dict; it is normalised with ``extract_tweet_info``
    and the document in ``collection`` with the same ``Tweet_Id`` is updated.
    Each of ``Retweet_Count``, ``Quote_count`` and ``Likes_Count`` is only
    ever increased, never lowered.

    The update uses MongoDB's ``$max`` operator, so the comparison happens
    server-side in a single atomic write. This replaces the earlier
    read-compare-rewrite of the whole document, which also re-sent ``_id``
    and every unchanged field. If no document matches, nothing is written.

    Returns None.
    """
    tweet_info = extract_tweet_info(data)
    counter_fields = ('Retweet_Count', 'Quote_count', 'Likes_Count')
    observed_counts = {field: tweet_info[field] for field in counter_fields}
    # $max stores the observed value only when it exceeds the stored one.
    collection.update_one(
        {"Tweet_Id": tweet_info['Tweet_Id']},
        {"$max": observed_counts},
    )

def delete_tweet(tweet_id, collec):
    """Remove one document with the given Tweet_Id from ``collec``.

    Prints a success message when exactly one document was deleted, a
    "not found" message otherwise, and an error message if anything in the
    attempt raises. Always returns None.
    """
    try:
        outcome = collec.delete_one({"Tweet_Id": tweet_id})
        removed_one = outcome.deleted_count == 1
        message = (
            f"Tweet with ID {tweet_id} deleted successfully."
            if removed_one
            else f"No tweet found with ID {tweet_id}."
        )
        print(message)
    except Exception as delete_error:
        print(f"Error deleting tweet: {delete_error}")

def extract_tweet_info(data):
    """Extract the fields stored in MongoDB from a raw tweet dict.

    Parameters:
        data: a tweet as parsed from JSON. ``id_str``, ``text``, ``user``
            (with ``id_str``, ``name``, ``screen_name``), ``retweet_count``,
            ``quote_count`` and ``favorite_count`` are required.

    Returns:
        A dict with keys ``created_at``, ``Tweet_Id``, ``Text``, ``Hashtag``,
        ``User_Id``, ``User_Name``, ``User_Screen_Name``, ``Retweet_Count``,
        ``Quote_count`` and ``Likes_Count``.

        ``created_at`` is reformatted as ``YYYY-MM-DD HH:MM:SS``, or is None
        when the tweet has no ``created_at`` key. ``Hashtag`` is the list of
        hashtag texts, or ``[]`` when ``entities``/``hashtags`` is missing.

    Raises:
        KeyError: if one of the required keys is missing.
        ValueError: if ``created_at`` does not match the expected format.
    """
    # The guarded values below are the ones actually returned; previously
    # they were computed and then ignored, so a missing key still raised.
    try:
        parsed_created_at = datetime.strptime(data['created_at'], '%a %b %d %H:%M:%S +0000 %Y')
    except KeyError:
        created_at = None
    else:
        created_at = parsed_created_at.strftime('%Y-%m-%d %H:%M:%S')

    try:
        hashtag_list = [hashtag['text'] for hashtag in data['entities']['hashtags']]
    except KeyError:
        hashtag_list = []

    user = data['user']
    return {
        'created_at': created_at,
        'Tweet_Id': data['id_str'],
        'Text': data['text'],
        'Hashtag': hashtag_list,
        'User_Id': user['id_str'],
        'User_Name': user['name'],
        'User_Screen_Name': user['screen_name'],
        'Retweet_Count': data['retweet_count'],
        'Quote_count': data['quote_count'],
        'Likes_Count': data['favorite_count'],
    }

def find_tweets_by_id(tweet_id, collec):
    """Return every document in ``collec`` whose Tweet_Id equals ``tweet_id``.

    This lookup used to be defined under the name ``find_duplicate_tweets``
    and was immediately shadowed by the aggregate version below, which made
    it unreachable. It now has its own name.
    """
    return list(collec.find({"Tweet_Id": tweet_id}))


def find_duplicate_tweets(collec):
    """List the Tweet_Id values that occur more than once in ``collec``.

    Documents are grouped by ``Tweet_Id`` only (tweet content is not
    compared). Returns a list of ``{"_id": <Tweet_Id>, "count": <n>}`` dicts,
    one per Tweet_Id with ``count`` greater than 1.
    """
    pipeline = [
        {"$group": {"_id": "$Tweet_Id", "count": {"$sum": 1}}},
        {"$match": {"count": {"$gt": 1}}},
    ]
    return list(collec.aggregate(pipeline))

In [None]:
def _refresh_or_insert(status, collection, source_id=None):
    """Update ``status`` if its id is already stored anywhere, else insert it.

    Returns True when the tweet id was found in any of the three collections
    (update path) and False when a new document was inserted into
    ``collection``. ``source_id``, when given, is stored on the new document.
    """
    if find_tweet(status['id_str']):
        # NOTE(review): find_tweet searches all three collections, but the
        # update only touches ``collection``; if the tweet lives in another
        # collection this is a no-op. Confirm that is intended.
        update_tweet(status, collection)
        return True
    document = extract_tweet_info(status)
    if source_id is not None:
        document['source_id'] = source_id
    insert_tweet(document, collection)
    return False


def process_tweets(file_path):
    """Load tweets from a line-delimited JSON file into the MongoDB collections.

    Only odd-numbered lines are parsed (assumes the even-numbered lines are
    blank separators; TODO confirm against the data files). Blank
    odd-numbered lines are skipped instead of crashing ``json.loads``.

    Routing of each parsed tweet:
      * Already stored in one of the three collections: its counters are
        refreshed there and nothing else happens.
      * Text starts with ``RT``: stored in ``retweet_collection`` with
        ``source_id`` set to the retweeted tweet's id (``'NotFound'`` when
        unavailable). An embedded ``quoted_status`` goes to
        ``tweet_collection`` and the ``retweeted_status`` to
        ``quoted_tweet_collection``; for a non-quote retweet the
        ``retweeted_status`` goes to ``tweet_collection``.
      * Carries ``quoted_status``: stored in ``quoted_tweet_collection`` with
        ``source_id`` set to the quoted tweet's id; the quoted tweet goes to
        ``tweet_collection``.
      * Anything else: stored in ``tweet_collection``.

    Parameters:
        file_path: path of the JSON-lines file to read.

    Raises:
        json.JSONDecodeError: if a non-blank odd-numbered line is not valid JSON.
        KeyError: if a tweet lacks a field required by the routing logic.
    """
    searched_collections = (tweet_collection, retweet_collection, quoted_tweet_collection)
    # Explicit UTF-8 so decoding does not depend on the platform default.
    with open(file_path, "r", encoding="utf-8") as read_file:
        for line_number, line in enumerate(read_file, start=1):
            if line_number % 2 == 0 or not line.strip():
                continue
            data = json.loads(line)

            # Already stored: refresh counters in the first collection holding it.
            holder = next(
                (coll for coll in searched_collections if find_tweet1(data['id_str'], coll)),
                None,
            )
            if holder is not None:
                update_tweet(data, holder)
                continue

            if data['text'].startswith('RT'):
                # Retweet
                retweet_doc = extract_tweet_info(data)
                try:
                    retweet_doc['source_id'] = data['retweeted_status']['id_str']
                except (KeyError, TypeError):
                    # No usable retweeted_status on this tweet.
                    retweet_doc['source_id'] = 'NotFound'
                insert_tweet(retweet_doc, retweet_collection)

                if 'quoted_status' in data:
                    quoted = data['quoted_status']
                    # NOTE(review): when the quoted tweet is already stored,
                    # this ``continue`` also skips the retweeted_status
                    # handling below. Confirm that is intended.
                    if _refresh_or_insert(quoted, tweet_collection):
                        continue
                    if 'retweeted_status' in data and _refresh_or_insert(
                        data['retweeted_status'],
                        quoted_tweet_collection,
                        source_id=quoted['id_str'],
                    ):
                        continue

                if 'retweeted_status' in data and data['is_quote_status'] == False:
                    _refresh_or_insert(data['retweeted_status'], tweet_collection)

            elif 'quoted_status' in data:
                # Quote tweet (text does not start with 'RT' in this branch)
                quote_doc = extract_tweet_info(data)
                quote_doc['source_id'] = data['quoted_status']['id_str']
                insert_tweet(quote_doc, quoted_tweet_collection)
                _refresh_or_insert(data['quoted_status'], tweet_collection)

            else:
                # Plain tweet
                insert_tweet(extract_tweet_info(data), tweet_collection)
                

In [None]:
# Ingest corona-out-2.json into the MongoDB collections.
process_tweets('corona-out-2.json')

In [None]:
# Ingest corona-out-3.json into the MongoDB collections.
process_tweets('corona-out-3.json')

# Processing the tweets further for Advanced Search Functionalities

In [None]:
# Further processing for Advanced Search
import iso639
import pymongo
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import string
from langdetect import detect_langs
from translate import Translator
import re

In [None]:
# Initialize NLTK resources
# English stop-word list as a set; preprocess_text uses it to filter tokens.
stop_words = set(stopwords.words('english'))  # English stopwords
# Module-level WordNet lemmatizer instance.
# NOTE(review): not referenced by any function visible in this section; confirm whether it is still needed.
lemmatizer = WordNetLemmatizer()

# Function to map a language identifier to its ISO 639-1 (two-letter) code
def get_language_code(lang_name):
    """Return the ``part1`` code of the iso639 entry matching ``lang_name``.

    The entry is looked up with ``iso639.languages.get(part3b=lang_name)``.
    Returns None when the lookup fails for any reason.

    NOTE(review): the lookup key is ``part3b``; confirm the installed iso639
    package exposes a field of that name, otherwise every call returns None.
    """
    try:
        return iso639.languages.get(part3b=lang_name).part1
    except Exception:
        # Best-effort lookup. ``Exception`` rather than a bare ``except`` so
        # KeyboardInterrupt and SystemExit still propagate.
        return None
    
# Function to collect WordNet synonyms for a word
def get_synonyms(word):
    """Return the set of lower-cased lemma names from every WordNet synset of ``word``.

    The set is empty when WordNet has no synset for ``word``.
    """
    return {
        lemma.name().lower()
        for synset in wn.synsets(word)
        for lemma in synset.lemmas()
    }

# Function to translate a sequence of words to English, one word at a time
def translate_words_to_english(words, lang_code):
    """Translate each item of ``words`` from ``lang_code`` to English.

    A single ``Translator`` is built for the call and applied to every word
    in order. Returns the translations as a list in the same order.
    """
    engine = Translator(to_lang="en", from_lang=lang_code)
    return list(map(engine.translate, words))

def preprocess_text(text):
    """Turn tweet text into a sorted list of unique search terms.

    Steps:
      1. Strip hyperlinks (anything matching ``http\\S+``).
      2. Tokenize with ``word_tokenize``.
      3. Keep a token when it contains one of the preserved symbols
         (``@``, ``#``, ``.``, ``_``), or when it is neither an English stop
         word nor punctuation. Kept tokens are lower-cased.
      4. Drop tokens made only of digits.
      5. Add the WordNet synonyms of every remaining token.

    No separate lemmatization pass is applied; terms are the tokens plus
    their synonyms.

    Returns:
        The unique terms as a list, sorted so that the stored field is
        deterministic (previously the order depended on set iteration).
    """
    # Remove hyperlinks
    text = re.sub(r'http\S+', '', text)

    preserved_symbols = ("@", "#", ".", "_")  # Add any other symbols to be preserved
    kept_tokens = []
    for token in word_tokenize(text):
        has_preserved_symbol = any(symbol in token for symbol in preserved_symbols)
        is_noise = token.lower() in stop_words or token in string.punctuation
        # Preserved symbols win over the stop-word/punctuation filter; this
        # spells out the original ``a and b or c`` precedence explicitly.
        if not has_preserved_symbol and is_noise:
            continue
        # Remove numbers
        if token.isdigit():
            continue
        kept_tokens.append(token.lower())

    search_terms = set()
    for token in kept_tokens:
        search_terms.add(token)
        search_terms.update(get_synonyms(token))
    return sorted(search_terms)

# Lazily created analyzer shared by all calls that do not pass their own.
_DEFAULT_SENTIMENT_ANALYZER = None


def sentiment_analysis(text, analyzer=None, positive_threshold=0.05, negative_threshold=-0.05):
    """Classify ``text`` as "Positive", "Negative" or "Neutral" with VADER.

    The label is derived from the ``compound`` score returned by the
    analyzer's ``polarity_scores``. VADER's compound score lies in [-1, 1],
    and its documentation gives the cut-offs used here as defaults:
    ``>= 0.05`` is positive, ``<= -0.05`` is negative, anything in between is
    neutral. The earlier 0.55 / 0.45 cut-offs labelled every text scoring
    0.45 or below, including a neutral 0.0, as "Negative".

    Parameters:
        text: the text to score.
        analyzer: optional object with a ``polarity_scores(text)`` method.
            When omitted, one ``SentimentIntensityAnalyzer`` is created on
            first use and reused, instead of being rebuilt on every call.
        positive_threshold: compound score at or above which the label is
            "Positive".
        negative_threshold: compound score at or below which the label is
            "Negative".

    Returns:
        One of the strings "Positive", "Negative", "Neutral".
    """
    global _DEFAULT_SENTIMENT_ANALYZER
    if analyzer is None:
        if _DEFAULT_SENTIMENT_ANALYZER is None:
            _DEFAULT_SENTIMENT_ANALYZER = SentimentIntensityAnalyzer()
        analyzer = _DEFAULT_SENTIMENT_ANALYZER

    compound = analyzer.polarity_scores(text)["compound"]
    if compound >= positive_threshold:
        return "Positive"
    if compound <= negative_threshold:
        return "Negative"
    return "Neutral"

In [None]:
def analyzeText(collection):
    """Annotate every tweet in ``collection`` with search terms, language and sentiment.

    For each document, the most likely language of its ``Text`` is detected.
    English text is passed to ``preprocess_text`` directly; any other
    language is first translated word by word to English. The document is
    then updated with ``processed_text``, ``language`` and ``sentiment``.
    Progress and per-tweet errors are printed; an error on one tweet does
    not stop the loop.

    NOTE(review): sentiment is computed on the original ``Text`` even for
    non-English tweets, not on the translated text. Confirm this is intended.
    """
    # Process text and save to new fields
    for tweet in collection.find():
        tweet_text = tweet["Text"]
        try:
            top_guess = detect_langs(tweet_text)[0]  # most likely language first
            lang_name = top_guess.lang
            print(f"Detected language: {lang_name}")  # Debug print for detected language

            if lang_name is None:
                print(f"Unsupported language: {lang_name}")
                continue

            if lang_name == 'en':
                text_for_terms = tweet_text
            else:
                words = word_tokenize(tweet_text)
                text_for_terms = ' '.join(translate_words_to_english(words, lang_name))

            new_fields = {
                "processed_text": preprocess_text(text_for_terms),
                "language": lang_name,
                "sentiment": sentiment_analysis(tweet_text),
            }
            collection.update_one({"_id": tweet["_id"]}, {"$set": new_fields})
            print(f"Processed text for tweet {tweet['_id']}")
        except Exception as e:
            print(f"Error processing tweet {tweet['_id']}: {e}")

In [None]:
# Annotate the documents in tweet_collection with processed_text, language and sentiment.
analyzeText(tweet_collection)

In [None]:
# Annotate the documents in retweet_collection with processed_text, language and sentiment.
analyzeText(retweet_collection)

In [None]:
# Annotate the documents in quoted_tweet_collection with processed_text, language and sentiment.
analyzeText(quoted_tweet_collection)