In [3]:
#Performed on youtube comments dataset
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from translate import Translator
import emoji
from textblob import TextBlob

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

file_path = 'youtube_comments.csv'
output_file_path = 'preprocessed_youtube_comments.csv'

df = pd.read_csv(file_path)

df['Comment_Lower'] = df['Comment'].str.lower()

df['Tokens'] = df['Comment_Lower'].apply(nltk.word_tokenize)

df['No_Punctuation'] = df['Tokens'].apply(lambda x: [word for word in x if word.isalnum()])

stop_words = set(stopwords.words('english'))
df['No_Stopwords'] = df['No_Punctuation'].apply(lambda x: [word for word in x if word not in stop_words])

stemmer = PorterStemmer()
df['Stemmed'] = df['No_Stopwords'].apply(lambda x: [stemmer.stem(word) for word in x])

lemmatizer = WordNetLemmatizer()
df['Lemmatized'] = df['No_Stopwords'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

translator = Translator(to_lang="gu")

def translate_to_gujarati(text):
    try:
        translated = translator.translate(text)
        return translated
    except:
        return text

df['Translated'] = df['Comment_Lower'].apply(translate_to_gujarati)

df['Emoji_Text'] = df['Comment'].apply(lambda x: emoji.demojize(x))

df.to_csv(output_file_path, index=False)

print(df.head())


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


                                             Comment  \
0  I materials had code if i want both displayed ...   
1  About time excel got with the picture. But fir...   
2  Was this a phased release? My images are showi...   
3  Wow!!<br>Thanks a lot for sharing these nice t...   
4  This is super helpful and useful as well! Than...   

                                       Comment_Lower  \
0  i materials had code if i want both displayed ...   
1  about time excel got with the picture. but fir...   
2  was this a phased release? my images are showi...   
3  wow!!<br>thanks a lot for sharing these nice t...   
4  this is super helpful and useful as well! than...   

                                              Tokens  \
0  [i, materials, had, code, if, i, want, both, d...   
1  [about, time, excel, got, with, the, picture, ...   
2  [was, this, a, phased, release, ?, my, images,...   
3  [wow, !, !, <, br, >, thanks, a, lot, for, sha...   
4  [this, is, super, helpful, and, useful, as,

In [1]:
# Dataset Given in Assignment
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from translate import Translator
import emoji
from textblob import TextBlob

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

file_path = 'dhoniretires_tweets.csv'
output_file_path = 'preprocessed_dhoniretires_tweets.csv'

df = pd.read_csv(file_path)

df['text_lower'] = df['text'].str.lower()

df['tokens'] = df['text_lower'].apply(nltk.word_tokenize)

df['no_punctuation'] = df['tokens'].apply(lambda x: [word for word in x if word.isalnum()])

stop_words = set(stopwords.words('english'))
df['no_stopwords'] = df['no_punctuation'].apply(lambda x: [word for word in x if word not in stop_words])

stemmer = PorterStemmer()
df['stemmed'] = df['no_stopwords'].apply(lambda x: [stemmer.stem(word) for word in x])

lemmatizer = WordNetLemmatizer()
df['lemmatized'] = df['no_stopwords'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

translator = Translator(to_lang="gu")

def translate_to_gujarati(text):
    try:
        translated = translator.translate(text)
        return translated
    except:
        return text

df['translated'] = df['text_lower'].apply(translate_to_gujarati)

df['emoji_text'] = df['text'].apply(lambda x: emoji.demojize(x))

df.to_csv(output_file_path, index=False)

print(df.head())


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                  user_name          user_location  \
0              Ravi Shastry  All 3 places possible   
1        India Today Sports           Noida, India   
2              Augmont Gold                 Mumbai   
3        Papa Louie's pizza                    NaN   
4  🇮🇳 पंडित जीवेश मिश्रा 🇮🇳  JAUNPUR UP INDIA 🇮🇳🇮🇳   

                                    user_description      user_created  \
0  Tracer bullet on TV, Enfield Bullet in life. W...  03-04-2015 14:15   
1  Live cricket scores, news, analysis and fun fa...  26-09-2017 10:57   
2  With Augmont Gold, you can now invest in as lo...  12-06-2012 12:10   
3                                                NaN  18-07-2020 05:36   
4  👍 सत्य बोलने के लिए किसी के आदेश की जरूरत नहीं...  01-08-2017 17:24   

   user_followers  user_friends  user_favourites  user_verified  \
0             578            66               86          False   
1            6434            15               10          False   
2            2112            