# Import Libraries

In [None]:
import pandas as pd
import numpy as np

import regex as re 
import string
import nltk
import datetime


from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
stopwords_eng = stopwords.words("english")
from nltk.corpus import wordnet


!pip install wordcloud
from wordcloud import WordCloud
import seaborn as sns

!pip install autocorrect
from autocorrect import Speller
spell = Speller()

# import tqdm for progress bars
from tqdm import tqdm

import warnings
warnings.filterwarnings("ignore")

## Import csv files

In [None]:
# Read each tweet export into its own dataframe; the names on the left line up
# one-to-one with the file names on the right.
(
    altcoin_df,
    appl_df,
    bitcoin_df,
    coindesk_df,
    crypto_df,
    gold_df,
    goog_df,
    yhoo_df,
) = (
    pd.read_csv(csv_name)
    for csv_name in (
        "altcoin.csv",
        "APPL.csv",
        "bitcoin.csv",
        "coindesk.csv",
        "Cryptocurrency.csv",
        "Gold.csv",
        "GOOG.csv",
        "YHOO.csv",
    )
)

### Check first few rows and shape of each dataframe

In [None]:
# Preview the first rows of the altcoin data.
altcoin_df.head()

In [None]:
# (rows, columns) of the altcoin data.
altcoin_df.shape

In [None]:
# Preview the first rows of the APPL data.
appl_df.head()

In [None]:
# (rows, columns) of the APPL data.
appl_df.shape

In [None]:
# Preview the first rows of the bitcoin data.
bitcoin_df.head()

In [None]:
# (rows, columns) of the bitcoin data.
bitcoin_df.shape

In [None]:
# Preview the first rows of the coindesk data.
coindesk_df.head()

In [None]:
# (rows, columns) of the coindesk data.
coindesk_df.shape

In [None]:
# Preview the first rows of the Cryptocurrency data.
crypto_df.head()

In [None]:
# (rows, columns) of the Cryptocurrency data.
crypto_df.shape

In [None]:
# Preview the first rows of the Gold data.
gold_df.head()

In [None]:
# (rows, columns) of the Gold data.
gold_df.shape

In [None]:
# Preview the first rows of the GOOG data.
goog_df.head()

In [None]:
# (rows, columns) of the GOOG data.
goog_df.shape

In [None]:
# Preview the first rows of the YHOO data.
yhoo_df.head()

In [None]:
# (rows, columns) of the YHOO data.
yhoo_df.shape

#### All dataframes have the same features, so we can concatenate them row-wise (axis=0) and perform the EDA on the merged dataframe

### Concat Dataframes

In [None]:
trade_df = pd.concat([altcoin_df, appl_df, bitcoin_df, coindesk_df, crypto_df,
                     gold_df, goog_df, yhoo_df], axis=0)

In [None]:
trade_df.reset_index(drop=True, inplace=True)

# Saving DataFrame in CSV File 

In [None]:
trade_df.to_csv('trade_df.csv', index=False)  

## Exploratory Data Analysis (EDA)

In [None]:
# (rows, columns) of the merged dataframe.
trade_df.shape

In [None]:
# Preview the first rows of the merged dataframe.
trade_df.head()

#### Get information of the dataframe columns

In [None]:
# Column names, dtypes and non-null counts.
trade_df.info()

**Datetime column is of type object, so we need to convert that into datetime**

In [None]:
# Parse the 'Datetime' column into real timestamps. pd.to_datetime is used
# because casting to the unit-less dtype 'datetime64' via .astype() raises a
# TypeError on pandas >= 2.0.
trade_df['Datetime'] = pd.to_datetime(trade_df['Datetime'])

In [None]:
# Re-check the dtypes after the 'Datetime' conversion.
trade_df.info()

In [None]:
# Peek at the first five tweet ids.
trade_df['Tweet Id'][:5]

In [None]:
# Number of rows that are exact copies of an earlier row.
trade_df.duplicated(keep='first').sum()

In [None]:
# Look at all rows of one specific tweet id before de-duplication.
trade_df[trade_df['Tweet Id']==1634342993812414464]

In [None]:
# Drop fully identical rows, keeping the first occurrence (pandas default).
trade_df.drop_duplicates(inplace=True)

In [None]:
# Same tweet id again, to compare against the output before de-duplication.
trade_df[trade_df['Tweet Id']==1634342993812414464]

In [None]:
# (rows, columns) after removing duplicates.
trade_df.shape

# setting max_colwidth to None to see all the strings of longer length on a single line.


In [None]:
# Never truncate cell contents when a dataframe is displayed.
pd.set_option('display.max_colwidth', None)


In [None]:
# Show one randomly chosen row.
trade_df.sample()

# Dropping URL Column 

We are looking at the URL and User columns. Since the URL does not store any information other than the tweet id and the username, which we already have in two other columns, we are dropping the URL column.

In [None]:
trade_df.drop(columns=['URL'],inplace=True)

In [None]:
trade_df.head()

# Getting only username from User column 

In [None]:
def get_username(user_link):
    """Strip the Twitter profile prefix from ``user_link``.

    Args:
        user_link: Profile link such as ``https://twitter.com/<handle>``.

    Returns:
        ``user_link`` with every ``http(s)://twitter.com/`` occurrence removed.
        Text that does not contain that prefix is returned unchanged.
    """
    # The dot is escaped so that only the literal host "twitter.com" matches;
    # an unescaped "." would accept any character in that position.
    return re.sub(r'https?://twitter\.com/', '', user_link)

In [None]:
# Replace every profile link in 'User' with the output of get_username.
trade_df['User']=trade_df['User'].apply(get_username)

In [None]:
# The first element of this shape is the number of distinct users.
trade_df['User'].value_counts().shape


In [None]:
# Preview the first two rows.
trade_df.head(2)

In [None]:
# Text of the row whose index label is 0.
trade_df['Text'][0]

# Pre-Processing Text Column 

In [None]:
def to_lowercase(text):
    """Return ``text`` converted to lower case."""
    return text.lower()

# Remove mentions (@user), hashtags (#tag) and cashtags ($TICKER)


In [None]:
def remove_hashtags(text):
    """Remove @mentions, #hashtags and $cashtags from ``text``.

    Args:
        text: Tweet text.

    Returns:
        ``text`` with every ``@word``, ``#word`` and ``$LETTERS`` token deleted.
        Surrounding whitespace is left in place.
    """
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#\w+', '', text)
    # [A-Za-z] rather than [A-z]: the range A-z also covers the ASCII symbols
    # [ \ ] ^ _ ` that sit between 'Z' and 'a'.
    text = re.sub(r'\$[A-Za-z]+', '', text)
    return text

# Remove Multiple Spaces

In [None]:
def remove_spaces(org_text):
    """Collapse every run of whitespace in ``org_text`` into a single space."""
    return re.sub(r'\s+', ' ', org_text)

# Remove Punctuation

In [None]:
# delete ASCII punctuation from a string
def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted."""
    drop_table = str.maketrans('', '', string.punctuation)
    return text.translate(drop_table)

# Remove Stopwords

In [None]:
# drop stop words
def remove_stop_words(tokens):
    """Return the tokens that are not in ``stopwords_eng``, in their original order."""
    kept = []
    for token in tokens:
        if token in stopwords_eng:
            continue
        kept.append(token)
    return kept

# Remove Urls 

In [None]:
def remove_urls(desc):
    """Remove every http(s) URL from ``desc``.

    Args:
        desc: Tweet text.

    Returns:
        ``desc`` with each ``http://...`` / ``https://...`` link deleted up to
        the next whitespace character. Surrounding whitespace is left in place.
    """
    # Match the whole link, not just the "https://t.co/" prefix, so that the
    # random slug after the host does not survive as a bogus word.
    return re.sub(r'https?://\S+', '', desc)

# Remove Non English Characters

In [None]:
# replace every non-letter character with a space
def remove_non_eng_chrs(text):
    """Return ``text`` with each character outside a-z / A-Z replaced by a space."""
    non_letter = re.compile('[^a-zA-Z]')
    return non_letter.sub(' ', text)

# Fix length of repeating characters

In [None]:
# shorten exaggerated character repetitions
def spell_len_fix(text):
    """Reduce runs of three or more identical characters to exactly two.

    Newline characters are not affected because ``.`` does not match them.
    """
    repeated_run = re.compile(r'(.)\1{2,}')
    return repeated_run.sub(r'\1\1', text)

# Remove emojis and other non-ASCII characters

In [None]:
# delete non-ASCII characters such as emoji and letters of other scripts
def remove_non_ascii_chrs(text):
    """Return ``text`` with every character outside the range 0x00-0x7F deleted."""
    non_ascii_run = re.compile(r'[^\x00-\x7F]+')
    return non_ascii_run.sub('', text)

# Fix Spellings 

In [None]:
# run the spell checker over every token
def correct_spelling(tokens):
    """Return a list holding ``spell(token)`` for each token, in order."""
    corrected = []
    for token in tokens:
        corrected.append(spell(token))
    return corrected

# Remove new line characters

In [None]:
def new_line(text):
    """Replace each run of carriage-return / line-feed characters with one space."""
    return re.sub('[\r\n]+', ' ', text)

# Tokenize

In [None]:
# function to tokenize
def tokenize(text):
    """Split ``text`` into tokens with NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens

# Remove Whitespaces

In [None]:
# trim whitespace around every token
def remove_whitespace(tokens):
    """Return a list with leading and trailing whitespace stripped from each token."""
    stripped = []
    for token in tokens:
        stripped.append(token.strip())
    return stripped

# Lemmatization 

In [None]:

# One lemmatizer instance shared by every call.
lem = WordNetLemmatizer()


# lemmatization
def lemmatization(tokens):
    """Return a list holding ``lem.lemmatize(token)`` for each token.

    ``lemmatize`` is called without a part-of-speech argument.
    """
    lemmas = []
    for token in tokens:
        lemmas.append(lem.lemmatize(token))
    return lemmas

# Preprocessing Pipeline

In [None]:
# function to preprocess the text
def preprocess_pipeline(org_text):
    """Clean one raw tweet and return the surviving tokens joined by spaces.

    The string-level cleaners run first, in a fixed order; the result is then
    tokenized, and the token-level cleaners run on the token list.
    ``correct_spelling`` is not part of the pipeline.
    """
    # String-level steps: each one receives the previous step's output.
    string_steps = (
        remove_urls,
        to_lowercase,
        remove_hashtags,
        remove_spaces,
        remove_punctuation,
        remove_non_eng_chrs,
        spell_len_fix,
        remove_non_ascii_chrs,
        new_line,
    )
    cleaned = org_text
    for step in string_steps:
        cleaned = step(cleaned)

    # Token-level steps.
    words = nltk.word_tokenize(cleaned)
    for step in (remove_whitespace, remove_stop_words, lemmatization):
        words = step(words)

    return " ".join(words)

 

In [None]:
# Preprocess the text, using tqdm to display a progress bar.
# tqdm.pandas() is what makes .progress_apply available on pandas objects.
tqdm.pandas(desc="Pre-Processing Progress")
# Store the cleaned version of every tweet in a new 'preprocessed_text' column.
trade_df['preprocessed_text'] = trade_df['Text'].progress_apply(preprocess_pipeline)

In [None]:
# Preview the result, including the new column.
trade_df.head()

In [None]:
# Create a word cloud from the preprocessed tweet text.
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Concatenate all preprocessed tweets into one string for WordCloud.generate().
titles_text = ' '.join(trade_df['preprocessed_text'])
wordcloud = WordCloud(width=800, height=500, background_color='white').generate(titles_text)

# Display the word cloud
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.title('Twitter Text Word Cloud')
plt.axis('off')
plt.tight_layout()
plt.show()   

In [None]:
# Extract every hashtag ('#' followed by word characters) from the raw tweet text
# and keep the per-tweet lists in a new column.
trade_df['hashtags_all'] = trade_df['Text'].str.findall(r'#\w+')
# Reuse the lists just extracted instead of scanning the whole text a second
# time, then count how often each hashtag occurs across all tweets.
hashtags_all = trade_df['hashtags_all']
count_hashtags = hashtags_all.explode().value_counts()
# Ten most frequent hashtags (value_counts sorts by descending count).
count_hashtags[:10]

# bar diagram of top 10 hashtags with its counts

In [None]:
plt.figure(figsize=(20, 8))
sns.set(font_scale=1.5)

# count_hashtags is sorted by descending frequency, so positions 0-9 are the
# ten most frequent hashtags; starting the slice at 1 would drop the top one.
sns.barplot(x=count_hashtags.index[:10], y=count_hashtags[:10], palette='magma')
plt.title('Top 10 #Hashtag', fontsize=24)
plt.xticks(rotation=45)
plt.xlabel('Most frequent hashtags', fontsize=24)
plt.ylabel('Count', fontsize=24)
plt.show()

# Top 10 most active users 

In [None]:
plt.figure(figsize=(20, 10))
sns.set(font_scale=2)
# Number of tweets per user, largest count first.
tweets_by_user = trade_df.groupby('User').size().sort_values(ascending=False)
# Horizontal bars: tweet count on the x axis, the ten most active users on the y axis.
sns.barplot(x=tweets_by_user[:10], y=tweets_by_user[:10].index,palette='magma')
plt.title('Top 10 active users by tweet volume',fontsize=24)
plt.xlabel('Count',fontsize=24)
plt.ylabel('Users',fontsize=24)
plt.show()

# WordCloud of Users 

In [None]:
# Join all usernames into one string. A user appears once per tweet, so more
# active users occur more often in the text handed to WordCloud.
# The names are used whole: 'User' holds bare handles with no leading symbol
# to strip.
all_users = ' '.join(trade_df['User'])

wordcloud_twitter = WordCloud(height=1000, width=1000, background_color="white").generate(all_users)
plt.figure(figsize=(10, 10))
plt.imshow(wordcloud_twitter, interpolation="bilinear")
plt.title('Word Cloud for Users')
plt.axis('off')
plt.tight_layout()
# plt.show must be called; a bare "plt.show" only references the function.
plt.show()

# Daily number of tweets

In [None]:
trade_df.head()

In [None]:
# NOTE(review): 'Datetime' was already converted to datetimes earlier in this
# notebook; confirm whether this second conversion and its day/month/year
# format string are still needed.
trade_df['Datetime'] = pd.to_datetime(trade_df['Datetime'], format='%d/%m/%Y')


In [None]:
# Keep only the calendar date (no time of day) in a new 'Date' column.
trade_df['Date'] = trade_df['Datetime'].dt.date

In [None]:
trade_df.head()

In [None]:
# Number of tweets on each date.
trade_df['Date'].value_counts()

In [None]:
# Same counts via groupby, largest count first.
tweets_per_day = trade_df.groupby('Date').size().sort_values(ascending=False)


In [None]:
plt.figure(figsize=(20, 8))
sns.set(font_scale=1.5)

# One bar per date; bar height is the number of tweets on that date.
sns.barplot(x=tweets_per_day.index, y=tweets_per_day,palette='magma')
plt.title('Tweets per day',fontsize=24)
plt.xticks(rotation=45)
plt.xlabel('Date',fontsize=24)
plt.ylabel('Count',fontsize=24)
plt.show()
# TODO: check which words appeared on 3/10 (the original note read "3/100", presumably a typo - confirm)

# Maximum Number of Tweets by User Per Day

In [None]:
trade_df.head()

In [None]:
# Number of tweets for every (user, date) pair, largest count first.
tweets_per_user_per_day = trade_df.groupby(['User','Date']).size().sort_values(ascending=False)
# Turn the result into a dataframe and keep the ten largest (user, date) counts.
tweets_per_user_per_day_reset= tweets_per_user_per_day.reset_index(name='Count Per Day')[:10]
tweets_per_user_per_day_reset

In [None]:
plt.figure(figsize=(20, 8))
sns.set(font_scale=1.5)

# One bar per row of the top-ten table, coloured by date; dodge=False keeps
# the bars of different dates on the same x position instead of side by side.
sns.barplot(x=tweets_per_user_per_day_reset['User'], y=tweets_per_user_per_day_reset['Count Per Day'],hue=tweets_per_user_per_day_reset['Date'],dodge=False)
plt.title('Tweets per day per user ',fontsize=24)
#plt.xticks(rotation=45)
plt.xlabel('User',fontsize=24)
plt.ylabel('Tweets per day',fontsize=24)
plt.show()

# Frequent hashtags per day 

In [None]:
trade_df.head()

We observed that the number of tweets on 2023-03-10 is significantly higher than on any other date.

In [None]:
hashtags_ten= trade_df[trade_df['Date']== datetime.date(2023,3,10) ]


In [None]:
daily_hashtags = ' ' .join([hashtags_ten[1:] for hashtags_ten in hashtags_ten['Text'].str.findall(r'#\w+') for hashtags_ten in hashtags_ten])

wordcloud_percount = WordCloud(height=800, width=1000,background_color="white",).generate(daily_hashtags)
plt.figure(figsize=(8, 10))
plt.imshow(wordcloud_percount, interpolation="bilinear")
plt.title('Word Cloud for Date:2023-03-10 Hashtags')
plt.axis('off')
plt.tight_layout()
#plt.savefig("twitter_logo_unigram_hashtags.png", format="png")
plt.show

In [None]:
# Tweets posted on 2023-03-09. The filter date is the 9th so that it agrees
# with the variable name and with the plot title below.
hashtags_nine = trade_df[trade_df['Date'] == datetime.date(2023, 3, 9)]

# Flatten the per-tweet hashtag lists into one string, dropping the leading
# '#' from every tag.
daily_hashtags = ' '.join(
    tag[1:]
    for tags in hashtags_nine['Text'].str.findall(r'#\w+')
    for tag in tags
)

wordcloud_percount = WordCloud(height=800, width=800, background_color="white").generate(daily_hashtags)
plt.figure(figsize=(8, 10))
plt.imshow(wordcloud_percount, interpolation="bilinear")
plt.title('Word Cloud for Date:2023-03-09 Hashtags')
plt.axis('off')
plt.tight_layout()
#plt.savefig("twitter_logo_unigram_hashtags.png", format="png")
# plt.show must be called; a bare "plt.show" only references the function.
plt.show()

In [None]:
def daily_hashtags(df, title):
    """Draw a bar chart of the ten most frequent hashtags in ``df['Text']``.

    Args:
        df: DataFrame with a ``'Text'`` column of tweet strings.
        title: Label appended to the chart title (for example a date).

    Returns:
        Whatever ``plt.show()`` returns.
    """
    # Count the occurrences of each hashtag across all rows of df.
    hashtags_all = df['Text'].str.findall(r'#\w+')
    count_hashtags = hashtags_all.explode().value_counts()
    # value_counts() sorts by descending frequency, so the first ten entries
    # are the top ten; starting at position 1 would drop the top hashtag.
    top_ten = count_hashtags.iloc[:10]

    plt.figure(figsize=(20, 8))
    sns.set(font_scale=1.5)

    sns.barplot(x=top_ten.index, y=top_ten, palette='magma')
    plt.title(f'Top 10 #Hashtag-{title}', fontsize=24)
    plt.xticks(rotation=45)
    plt.xlabel('Most frequent hashtags', fontsize=24)
    plt.ylabel('Count', fontsize=24)
    return plt.show()

In [None]:
# Draw one hashtag chart for every distinct date in the data.
for dates in trade_df['Date'].unique():
    #date_formatting= re.sub(r'[\-]', r',',str(dates))
    # Rows belonging to the current date only.
    daily_df= trade_df[trade_df['Date']== dates]
    daily_hashtags(daily_df,dates)


In [None]:
def daily_hashtags(df, title):
    """Draw a bar chart of the ten most frequent hashtags in ``df['Text']``.

    Args:
        df: DataFrame with a ``'Text'`` column of tweet strings.
        title: Label appended to the chart title (for example a date).

    Returns:
        Whatever ``plt.show()`` returns.
    """
    # Count the occurrences of each hashtag across all rows of df.
    hashtags_all = df['Text'].str.findall(r'#\w+')
    count_hashtags = hashtags_all.explode().value_counts()
    # value_counts() sorts by descending frequency, so the first ten entries
    # are the top ten; starting at position 1 would drop the top hashtag.
    top_ten = count_hashtags.iloc[:10]

    plt.figure(figsize=(20, 8))
    sns.set(font_scale=1.5)

    sns.barplot(x=top_ten.index, y=top_ten, palette='magma')
    plt.title(f'Top 10 #Hashtag-{title}', fontsize=24)
    plt.xticks(rotation=45)
    plt.xlabel('Most frequent hashtags', fontsize=24)
    plt.ylabel('Count', fontsize=24)
    return plt.show()

In [None]:
# Draw one hashtag chart for every distinct date in the data.
for dates in trade_df['Date'].unique():
    #date_formatting= re.sub(r'[\-]', r',',str(dates))
    # Rows belonging to the current date only.
    daily_df= trade_df[trade_df['Date']== dates]
    daily_hashtags(daily_df,dates)


# Keywords Analysis

In [None]:
# Given keywords
keywords = ['Altcoin', 'Bitcoin', 'Coindesk', 'Cryptocurrency', 'Gold', 'APPL', 'GOOG', 'YHOO']

# DataFrame to store results: one row per date, one column per keyword.
# The dates are sorted so that a line plot over the index runs chronologically.
results = pd.DataFrame(index=sorted(trade_df['Date'].dropna().unique()), columns=keywords)
results.index.name = 'Date'

# keyword -> number of distinct users who mentioned it, over all dates
user_counts = {}

for keyword in keywords:
    # Select the tweets mentioning the keyword (case-insensitive) once and
    # reuse the selection for both counts.
    mentions = trade_df[trade_df['Text'].str.contains(keyword, case=False, na=False)]
    # Tweets per day containing the keyword; days without a match count as 0
    # instead of being left as NaN.
    results[keyword] = mentions.groupby('Date').size().reindex(results.index, fill_value=0)
    # Unique users mentioning the keyword across the whole data set.
    user_counts[keyword] = mentions['User'].nunique()

# Convert user_counts to a DataFrame indexed by keyword.
user_results = pd.DataFrame.from_dict(user_counts, orient='index', columns=['Daily Users'])

# Daily number of tweets for each keyword

In [None]:
# Visualization of the daily number of tweets for each keyword:
# one line per keyword, dates on the x axis.
plt.figure(figsize=(12, 6))
for keyword in keywords:
    plt.plot(results.index, results[keyword], label=keyword)
plt.title('Daily Number of Tweets')
plt.xlabel('Date')
plt.ylabel('Number of Tweets')
plt.legend()
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


# Daily Number of users for each keyword 

In [None]:
# Number of unique users for each keyword: one bar per keyword.
# user_results holds a single total per keyword (not a per-day figure), so the
# title describes it as such.
plt.figure(figsize=(12, 6))
plt.bar(user_results.index, user_results['Daily Users'])
plt.title('Number of Unique Users per Keyword')
plt.xlabel('Keyword')
plt.ylabel('Number of Users')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()