<a href="https://colab.research.google.com/github/Vijaykumar-HealthGIS/NLP/blob/main/Twitte_cleaned_%26Translational.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install necessary libraries if not installed
!pip install deep-translator pandas emoji langdetect

import pandas as pd
import io
import re
import time
import emoji
from google.colab import files
from deep_translator import GoogleTranslator
from langdetect import detect, DetectorFactory

# Ensure consistent language detection
DetectorFactory.seed = 0

# Helpers and function to clean text (keeps languages but removes unwanted elements)
def _strip_tagged_words(text, markers, is_word_char):
    """Return *text* without the words that are prefixed by one of *markers*.

    A tagged word is a marker character, followed by a letter, digit or
    underscore, followed by any run of characters accepted by *is_word_char*.
    Markers that are not followed by a word character are left in place.
    """
    kept = []
    pos, end = 0, len(text)
    while pos < end:
        ch = text[pos]
        starts_tag = (
            ch in markers
            and pos + 1 < end
            and (text[pos + 1] == "_" or text[pos + 1].isalnum())
        )
        if not starts_tag:
            kept.append(ch)
            pos += 1
            continue
        pos += 2  # skip the marker and the first word character
        while pos < end and is_word_char(text[pos]):
            pos += 1
    return "".join(kept)


def clean_text(text):
    """Strip tweet noise from *text* while keeping the text of any language.

    Removes URLs, a fixed list of link-related phrases, the word "RT",
    @mentions, #hashtags, emojis and a fixed set of punctuation characters,
    then collapses whitespace.

    Args:
        text: The raw message. Anything that is not a ``str`` (NaN, None,
            numbers, ...) is treated as missing.

    Returns:
        The cleaned string, or ``""`` for missing / non-string input.
    """
    if not isinstance(text, str):  # covers NaN/None as well as other non-string input
        return ""

    # Function-scope import: unicodedata is not among the file's top-level imports.
    from unicodedata import category

    def is_word_char(ch):
        # The regex class \w does not match combining marks (e.g. Devanagari
        # vowel signs), so '#\w+' would cut a Hindi hashtag after its first
        # consonant. Accept marks and zero-width (non-)joiners as word
        # characters, but not emoji variation selectors.
        if ch == "_" or ch.isalnum() or ch in "\u200c\u200d":
            return True
        return category(ch).startswith("M") and not ("\ufe00" <= ch <= "\ufe0f")

    # Remove URLs. No trailing \b: it made the match stop before a final
    # non-word character and left fragments such as "/" behind.
    text = re.sub(r'\b(?:https?|ftp):\/\/\S+', '', text)  # Remove URLs
    text = re.sub(r'\bwww\.\S+', '', text)  # Remove www. links
    text = re.sub(r'\bbit\.ly\S+', '', text)  # Remove shortened links

    # Remove any words related to links
    text = re.sub(r'\bclick here\b|\bvisit\b|\bwebsite\b|\bcheck out\b|\bmore info\b|\bsee more\b', '', text, flags=re.IGNORECASE)

    # Remove "RT", mentions (@username), and hashtags (#tag)
    text = re.sub(r'\bRT\b', '', text, flags=re.IGNORECASE)  # Remove "RT"
    text = _strip_tagged_words(text, "@#", is_word_char)  # Remove @mentions and #hashtags

    # Remove emojis
    text = emoji.replace_emoji(text, replace='')

    # Remove special characters (but keep multilingual text)
    text = re.sub(r'[\!\$\%\^\&\*\(\)\[\]\{\}\|\\\;\:\"\'\<\>\?]', '', text)  # Remove special characters

    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Function to translate text into English
def translate_text(text):
    """Translate *text* into English, leaving English text unchanged.

    Args:
        text: The (already cleaned) message.

    Returns:
        ``""`` for empty, whitespace-only or non-string input; *text* itself
        when it is detected as English; otherwise the translator's result.
        ``""`` is returned when the translation call fails (the error is
        printed).
    """
    if not isinstance(text, str) or not text.strip():  # Skip missing/empty text
        return ""

    # Function-scope import: the exception class is not among the names the
    # file imports from langdetect at the top.
    from langdetect.lang_detect_exception import LangDetectException

    # Detection and translation are guarded separately so that a detection
    # failure does not blank the row and is not reported as a translation
    # error. When detection fails, fall through and let the translator's own
    # source='auto' handling decide.
    try:
        detected_lang = detect(text)  # Detect language
    except LangDetectException:
        detected_lang = None

    if detected_lang == "en":  # If already in English, keep it unchanged
        return text

    time.sleep(0.3)  # Delay to prevent rate limits
    try:
        return GoogleTranslator(source='auto', target='en').translate(text)
    except Exception as e:
        print(f"Translation error for '{text}': {e}")
        return ""  # Return blank if translation fails

# Upload CSV file
# Ask the user for a CSV, clean and translate its 'Message' column, then
# save the result and offer it for download.
uploaded = files.upload()

if not uploaded:
    print("No file uploaded. Please upload a CSV file.")
else:
    filename = next(iter(uploaded))  # only the first uploaded file is processed
    print(f"Uploaded file: {filename}")
    raw_bytes = uploaded[filename]

    # Try reading the CSV file with different encodings (first success wins)
    df = None
    for encoding, label in (("utf-8", "UTF-8"), ("latin-1", "Latin-1")):
        try:
            df = pd.read_csv(io.BytesIO(raw_bytes), encoding=encoding, on_bad_lines='skip')
        except Exception as read_error:
            print(f"{label} encoding error: {read_error}")
        else:
            print(f"File read successfully using {encoding} encoding!")
            break

    if df is None or df.empty:
        # Stop here: there is nothing to process, and reporting a missing
        # 'Message' column for a file that never loaded would be misleading.
        print("DataFrame is empty or could not be loaded. Please check your CSV file.")
    else:
        print("DataFrame loaded successfully! Preview:")
        print(df.head())

        # Check if 'Message' column exists
        if "Message" not in df.columns:
            print("Error: 'Message' column not found in the dataset!")
        else:
            # Step 1: Clean the 'Message' column and create 'Cleaned_Message'
            df["Cleaned_Message"] = df["Message"].apply(clean_text)
            print("Data cleaning completed! Preview:")
            print(df[["Message", "Cleaned_Message"]].head())

            # Step 2: Translate cleaned text into English
            df["English_Translation"] = df["Cleaned_Message"].apply(translate_text)
            print("Translation completed! Preview:")
            print(df[["Cleaned_Message", "English_Translation"]].head())

            # Step 3: Save and download the cleaned & translated file
            output_filename = "translated_file.csv"
            df.to_csv(output_filename, index=False, encoding='utf-8')
            print(f"File saved: {output_filename}")
            files.download(output_filename)




Saving new_ex_data.csv to new_ex_data.csv
Uploaded file: new_ex_data.csv
File read successfully using utf-8 encoding!
DataFrame loaded successfully! Preview:
   S.No.                                            Message
0      1  शहर के हर चौराहों पर रात को चर्चा आम हैं। आज च...
1      2  #AWHCL 470 made 900+ 🔥 \n#stockmarketsindia #R...
2      3  RT @rahulbhardwajcg राजनीति के मैदान में, भाजप...
3      4  Hope Indians can align the rest of the tactics...
4      5  RT @rahulsijariya @@BJP4India#madhyapradesh #b...
Data cleaning completed! Preview:
                                             Message  \
0  शहर के हर चौराहों पर रात को चर्चा आम हैं। आज च...   
1  #AWHCL 470 made 900+ 🔥 \n#stockmarketsindia #R...   
2  RT @rahulbhardwajcg राजनीति के मैदान में, भाजप...   
3  Hope Indians can align the rest of the tactics...   
4  RT @rahulsijariya @@BJP4India#madhyapradesh #b...   

                                     Cleaned_Message  
0  शहर के हर चौराहों पर रात को चर्चा आम हैं। आज च...  
1

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Install required libraries (deep-translator, pandas, emoji) if not installed
!pip install deep-translator pandas emoji

import pandas as pd
import io
import re
import time
import emoji
from google.colab import files
from deep_translator import GoogleTranslator

# Helpers and function to clean text (removes RT, @mentions, hashtags, special symbols, and emojis)
def _strip_tagged_words(text, markers, is_word_char):
    """Return *text* without the words that are prefixed by one of *markers*.

    A tagged word is a marker character, followed by a letter, digit or
    underscore, followed by any run of characters accepted by *is_word_char*.
    Markers that are not followed by a word character are left in place.
    """
    kept = []
    pos, end = 0, len(text)
    while pos < end:
        ch = text[pos]
        starts_tag = (
            ch in markers
            and pos + 1 < end
            and (text[pos + 1] == "_" or text[pos + 1].isalnum())
        )
        if not starts_tag:
            kept.append(ch)
            pos += 1
            continue
        pos += 2  # skip the marker and the first word character
        while pos < end and is_word_char(text[pos]):
            pos += 1
    return "".join(kept)


def clean_text(text):
    """Reduce *text* to its words, in whatever script they are written.

    Removes the word "RT", @mentions, #hashtags, emojis and every character
    that is neither whitespace nor part of a word, then collapses whitespace.

    Args:
        text: The raw message; non-string values are converted with ``str``.

    Returns:
        The cleaned string, or ``""`` when *text* is NaN/None.
    """
    if pd.isna(text):  # Handle NaN values
        return ""

    # Function-scope import: unicodedata is not among the file's top-level imports.
    from unicodedata import category

    def is_word_char(ch):
        # The regex class \w does not match combining marks, so a filter such
        # as [^\w\s] deletes the vowel signs of Devanagari and other scripts
        # and garbles the text. Accept marks and zero-width (non-)joiners as
        # word characters, but not emoji variation selectors.
        if ch == "_" or ch.isalnum() or ch in "\u200c\u200d":
            return True
        return category(ch).startswith("M") and not ("\ufe00" <= ch <= "\ufe0f")

    text = str(text)  # Ensure it's a string
    # Whole word only: without the boundaries, "rt " at the end of ordinary
    # words (e.g. "start now") was removed as well.
    text = re.sub(r'\bRT\b', '', text, flags=re.IGNORECASE)  # Remove "RT"
    text = _strip_tagged_words(text, "@#", is_word_char)  # Remove words starting with @ or #
    # Emojis go first so that complete emoji sequences are removed before the
    # per-character filter below runs.
    text = emoji.replace_emoji(text, replace='')  # Remove all emojis
    # Remove special characters & punctuation (keeps whitespace and word characters)
    text = "".join(ch for ch in text if ch.isspace() or is_word_char(ch))
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace
    return text

# Upload CSV file
# Ask the user for a CSV, clean and translate its 'Message' column, then
# save the result and offer it for download.
uploaded = files.upload()

if not uploaded:
    print("No file uploaded. Please upload a CSV file.")
else:
    filename = next(iter(uploaded))  # only the first uploaded file is processed
    print(f"Uploaded file: {filename}")
    raw_bytes = uploaded[filename]

    # Try reading the CSV file with different encodings (first success wins)
    df = None
    for encoding, label in (("utf-8", "UTF-8"), ("latin-1", "Latin-1")):
        try:
            df = pd.read_csv(io.BytesIO(raw_bytes), encoding=encoding, on_bad_lines='skip')
        except Exception as read_error:
            print(f"{label} encoding error: {read_error}")
        else:
            print(f"File read successfully using {encoding} encoding!")
            break

    if df is None or df.empty:
        # Stop here: there is nothing to process, and reporting a missing
        # 'Message' column for a file that never loaded would be misleading.
        print("DataFrame is empty or could not be loaded. Please check your CSV file.")
    else:
        print("DataFrame loaded successfully! Preview:")
        print(df.head())

        # Proceed if 'Message' column exists
        if "Message" not in df.columns:
            print("Error: 'Message' column not found in the dataset!")
        else:
            # Step 1: Clean the 'Message' column and create 'Cleaned_Message'
            df["Cleaned_Message"] = df["Message"].apply(clean_text)
            print("Data cleaning completed! Preview:")
            print(df[["Message", "Cleaned_Message"]].head())

            # Step 2: Translate cleaned text into English
            # One translator instance is shared by all rows.
            translator = GoogleTranslator(source='auto', target='en')

            def translate_text(text):
                """Translate *text* to English; return it unchanged on failure."""
                if not text.strip():  # Skip empty text
                    return ""
                try:
                    time.sleep(0.3)  # Delay to prevent rate limits
                    return translator.translate(text)
                except Exception as e:
                    print(f"Translation error for '{text}': {e}")
                    return text  # Return original if translation fails

            # Apply translation
            df["English_Translation"] = df["Cleaned_Message"].apply(translate_text)
            print("Translation completed! Preview:")
            print(df[["Cleaned_Message", "English_Translation"]].head())

            # Step 3: Save and download the cleaned & translated file
            output_filename = "translated_file.csv"
            df.to_csv(output_filename, index=False, encoding='utf-8')
            print(f"File saved: {output_filename}")
            files.download(output_filename)


Collecting deep-translator
  Downloading deep_translator-1.11.4-py3-none-any.whl.metadata (30 kB)
Collecting emoji
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Downloading deep_translator-1.11.4-py3-none-any.whl (42 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading emoji-2.14.1-py3-none-any.whl (590 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m590.6/590.6 kB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji, deep-translator
Successfully installed deep-translator-1.11.4 emoji-2.14.1


Saving Final_data.csv to Final_data.csv
Uploaded file: Final_data.csv
File read successfully using utf-8 encoding!
DataFrame loaded successfully! Preview:
   Sender Followers Count  SenderAge SenderGender  \
0                      30          0            M   
1                    1500          0            M   
2                    3481         62          NaN   
3                       8          0          NaN   
4                     308         30            M   

                                             Message           CreatedTime  \
0  शहर के हर चौराहों पर रात को चर्चा आम हैं। आज च...  Jul 31 06:59:54 2024   
1  #AWHCL 470 made 900+ 🔥 \n#stockmarketsindia #R...  Jul 29 11:44:35 2024   
2  RT @rahulbhardwajcg राजनीति के मैदान में, भाजप...  Jul 29 09:24:18 2024   
3  Hope Indians can align the rest of the tactics...  Jul 27 13:00:19 2024   
4  RT @rahulsijariya @@BJP4India#madhyapradesh #b...  Jul 26 20:38:42 2024   

  CountryCode      Tweet Generator Sentiment           Ac

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>