# Load data with the new search url

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv(r"..\data\tweets_01_02_2019.csv")

In [3]:
df.shape

(240, 6)

In [4]:
df.drop_duplicates(inplace=True)

In [5]:
df.shape

(40, 6)

In [6]:
df.lang.value_counts(dropna=False)

en     33
ro      2
it      2
fr      1
und     1
bg      1
Name: lang, dtype: int64

In [7]:
english_tweets = df.loc[df.lang == 'en', :]

In [8]:
english_tweets.head()

Unnamed: 0,hour_created,lang,stats,time_created,tweet_text,user_name
0,3:06 AM - 30 Jan 2019,en,"['0 replies', '0 retweets', '0 likes']",Jan 30,"@ndr about Plovdiv, ""The Ancient Plovdiv has b...",@Plovdiv2019
1,6:34 AM - 31 Jan 2019,en,"['0 replies', '5 retweets', '13 likes']",9h9 hours ago,The Southern-Westphalian newspaper Siegener Ze...,@Plovdiv2019
2,10:01 AM - 30 Jan 2019,en,"['0 replies', '0 retweets', '1 like']",Jan 30,"RT Matera2019 ""RT IFLA: Congratulations to #Ma...",@kalauras
3,3:04 AM - 30 Jan 2019,en,"['0 replies', '3 retweets', '8 likes']",Jan 30,Muzeiko is expecting you in Plovdiv! Playing k...,@Plovdiv2019
4,11:40 AM - 29 Jan 2019,en,"['0 replies', '5 retweets', '16 likes']",Jan 29,The official ceremony of the opening of Europe...,@UnravelTravelTV


In [9]:
english_tweets['time_created'].value_counts()

Jan 30           3
Jan 12           3
Jan 13           3
Jan 23           2
Jan 22           2
Jan 15           2
Jan 7            2
Jan 29           2
Jan 11           2
9h9 hours ago    1
Jan 19           1
Jan 31           1
Jan 20           1
2 Jan 2018       1
Jan 18           1
Jan 28           1
Jan 3            1
Jan 27           1
2h2 hours ago    1
Jan 17           1
Jan 25           1
Name: time_created, dtype: int64

In [10]:
%run ../dataprep/data_clean.py
# load the two methods in data_clean.py
# - extract_tags and tweet_cleaner

In [11]:
english_tweets['tags'] = english_tweets.apply(lambda x: extract_tags(x.tweet_text), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [12]:
english_tweets.columns

Index(['hour_created', 'lang', 'stats', 'time_created', 'tweet_text',
       'user_name', 'tags'],
      dtype='object')

In [13]:
english_tweets.shape

(33, 7)

![title](https://cdn-images-1.medium.com/max/1600/1*Xhm9c9qDfXa3ZCQjiOvm_w.jpeg)

In [14]:
english_tweets['cleaned_tweets'] = english_tweets.apply(lambda x: tweet_cleaner(x['tweet_text']), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [15]:
english_tweets.cleaned_tweets.head()

0     about plovdiv   the ancient plovdiv has been ...
1    the southern westphalian newspaper siegener ze...
2    rt matera      rt ifla  congratulations to mat...
3    muzeiko is expecting you in plovdiv  playing k...
4    the official ceremony of the opening of europe...
Name: cleaned_tweets, dtype: object

In [16]:
import spacy

nlp = spacy.load('en')

In [17]:
def remove_stop_and_punc(sentence):
    return [token.lemma_ for token in nlp(sentence) 
            if not token.is_stop and not token.is_punct and not token.is_space]

In [18]:
sentence = "This is great day to write your project for NLP."

In [19]:
remove_stop_and_punc(sentence)

['this', 'great', 'day', 'write', 'project', 'nlp']

In [20]:
english_tweets.columns

Index(['hour_created', 'lang', 'stats', 'time_created', 'tweet_text',
       'user_name', 'tags', 'cleaned_tweets'],
      dtype='object')

In [26]:
english_tweets['token_tweets'] = english_tweets['cleaned_tweets'].apply(remove_stop_and_punc)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [28]:
english_tweets['token_tweets'][0]

['plovdiv',
 'ancient',
 'plovdiv',
 'cultural',
 'centre',
 'long',
 'time',
 'prestigious',
 'german',
 'television',
 'event',
 'opening',
 'weekend',
 'plovdiv',
 'ecoc',
 'ecocfamily']

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Tweets_mix = []

# for i, row in enumerate(tokenized_data):
#     all_row = ""
#     for sent in row:
#         all_row += sent
#     Tweets_mix.append(all_row)

tfidf_model = TfidfVectorizer(max_df=0.9, max_features=1000,
                              min_df=0.1, stop_words='english',
                              use_idf=True, tokenizer=None, ngram_range=(1,1))

tfidf_matrix = tfidf_model.fit_transform(english_tweets.tweet_text) #fit the vectorizer to synopses

print ("In total, there are " + str(tfidf_matrix.shape[0]) + \

      " synoposes and " + str(tfidf_matrix.shape[1]) + " terms.")

In total, there are 33 synoposes and 20 terms.


In [35]:
from sklearn.cluster import KMeans

num_clusters = 2
km = KMeans(n_clusters=num_clusters)
km.fit(tfidf_matrix)
english_tweets['kmeans'] = pd.Series(km.labels_)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [38]:
english_tweets.loc[english_tweets.kmeans == 1.0, 'tweet_text']

0     @ndr about Plovdiv, "The Ancient Plovdiv has b...
3     Muzeiko is expecting you in Plovdiv! Playing k...
5     ‘Changing’ is is the first solo performance of...
6     The exhibition "Neosvetenite dvorove" is devot...
8     #awEare with @GommalaccaT #Together #Plovdiv20...
10    Our programme for 2019 is offering plenty of e...
19    Euripides’ tragedy "Medea" will be presented o...
Name: tweet_text, dtype: object

In [31]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
 
accuracy = accuracy_score(list(map(int, sentiment)), clusters)
precision, recall, f1_score, _ = precision_recall_fscore_support(list(map(int, sentiment)), clusters, average='binary')
 
print("Accuracy: ", accuracy)
print("Precision: ", precision)
print("Recall: ", recall)
print("F1 score: ", f1_score)

NameError: name 'sentiment' is not defined