In [3]:
import torch 
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import os 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import csv

# Pin every random-number source used here to one seed so repeated runs match.
myseed = 10
np.random.seed(myseed)
torch.manual_seed(myseed)
if torch.cuda.is_available():
    # Seed the generators of all CUDA devices, not only the current one.
    torch.cuda.manual_seed_all(myseed)

# Ask cuDNN for deterministic algorithms and turn off its benchmark-based autotuning.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [4]:
# Read the tweet sample from the notebook's working directory into a DataFrame.
# NOTE(review): the head() preview shows an 'Unnamed: 0' column, presumably a saved
# row index -- confirm whether index_col=0 is wanted here.
data = pd.read_csv(filepath_or_buffer='random_sample.csv')

In [6]:
# Preview the first five rows to sanity-check the load.
data.head(n=5)

Unnamed: 0,user_id,status_id,created_at,screen_name,text,source,display_text_width,reply_to_status_id,reply_to_user_id,reply_to_screen_name,...,statuses_count,favourites_count,account_created_at,verified,profile_url,profile_expanded_url,account_lang,profile_banner_url,profile_background_url,profile_image_url
0,1060370282,1392106273223610379,2021-05-11T13:16:14Z,RepMullin,📲 REMINDER: I will be hosting a Telephone Town...,Twitter Web App,180,,,,...,4902,1388,2013-01-04T12:43:55Z,True,https://t.co/Qv5teppuYQ,http://www.mullin.house.gov,,https://pbs.twimg.com/profile_banners/10603702...,http://abs.twimg.com/images/themes/theme1/bg.png,http://pbs.twimg.com/profile_images/9366845588...
1,818554054309715969,1284947108827746305,2020-07-19T20:23:58Z,SenJackyRosen,The best way the Senate can honor the legacy o...,Twitter for iPhone,110,,,,...,8870,611,2017-01-09T20:24:29Z,True,https://t.co/fq26igiB6n,http://www.rosen.senate.gov,,https://pbs.twimg.com/profile_banners/81855405...,,http://pbs.twimg.com/profile_images/1085182795...
2,385429543,988119683831234560,2018-04-22T18:17:45Z,RepMcNerney,"Today, many of you will be celebrating #EarthD...",Twitter for iPhone,227,,,,...,2781,593,2011-10-05T13:52:58Z,True,https://t.co/So4mXjTwdK,http://mcnerney.house.gov,,https://pbs.twimg.com/profile_banners/38542954...,http://abs.twimg.com/images/themes/theme1/bg.png,http://pbs.twimg.com/profile_images/4536029670...
3,1080573351914061825,1184820381082968064,2019-10-17T13:16:06Z,RepDavidTrone,Elijah Cummings was a voice for the powerless ...,Twitter for iPhone,275,,,,...,3987,522,2019-01-02T21:15:37Z,True,https://t.co/ROC81aFsfE,https://trone.house.gov,,https://pbs.twimg.com/profile_banners/10805733...,,http://pbs.twimg.com/profile_images/1110578625...
4,1065995022,906239678013169664,2017-09-08T19:35:51Z,RepWalorski,Learn about our #taxreform plan to help busine...,TweetDeck,145,,,,...,6296,702,2013-01-06T15:41:40Z,True,http://t.co/1c3JNm0kJ5,http://walorski.house.gov,,https://pbs.twimg.com/profile_banners/10659950...,http://abs.twimg.com/images/themes/theme1/bg.png,http://pbs.twimg.com/profile_images/1352299062...


Preprocessing

In [53]:
# Number of missing cells in each column (a Series indexed by column name).
Missing_Count = data.isna().sum()

In [62]:
# Display the per-column missing-value counts computed from `data`.
Missing_Count

user_id                       0
status_id                     0
created_at                    0
screen_name                   0
text                          0
                          ...  
profile_expanded_url        891
account_lang              25000
profile_banner_url          187
profile_background_url     5789
profile_image_url             0
Length: 90, dtype: int64

In [72]:
# Bucket column names by how much of their data is missing.
#
# The cut-offs are expressed relative to the number of rows instead of the literal
# 25000 / 20000 / 10000 that only fit this particular 25000-row sample, so the
# buckets keep their meaning if the CSV is re-sampled. For 25000 rows the
# results are identical to the hard-coded version.
n_rows = len(data)
MOST_MISSING_FRAC = 0.8  # strictly more than 80% missing (20000 of 25000 rows)
LOW_MISSING_FRAC = 0.4   # strictly less than 40% missing (10000 of 25000 rows)

# Columns with no missing values at all.
No_Missing = list(Missing_Count[Missing_Count == 0].index)
# Columns that are empty in every row.
All_Missing = list(Missing_Count[Missing_Count == n_rows].index)
# Columns that are mostly empty; this bucket also contains everything in All_Missing.
Most_Missing = list(Missing_Count[Missing_Count > MOST_MISSING_FRAC * n_rows].index)
# Columns that are mostly populated; this bucket also contains everything in No_Missing.
# Columns between the two fractions fall into neither Most_Missing nor Low_Missing.
Low_Missing = list(Missing_Count[Missing_Count < LOW_MISSING_FRAC * n_rows].index)

In [74]:
# Display the names of the columns placed in the low-missing bucket.
Low_Missing

['user_id',
 'status_id',
 'created_at',
 'screen_name',
 'text',
 'source',
 'display_text_width',
 'is_quote',
 'is_retweet',
 'favorite_count',
 'retweet_count',
 'lang',
 'status_url',
 'name',
 'location',
 'description',
 'url',
 'protected',
 'followers_count',
 'friends_count',
 'listed_count',
 'statuses_count',
 'favourites_count',
 'account_created_at',
 'verified',
 'profile_url',
 'profile_expanded_url',
 'profile_banner_url',
 'profile_background_url',
 'profile_image_url']

In [77]:
# Inspect the profile-image URL column across all rows.
data['profile_image_url']

0        http://pbs.twimg.com/profile_images/9366845588...
1        http://pbs.twimg.com/profile_images/1085182795...
2        http://pbs.twimg.com/profile_images/4536029670...
3        http://pbs.twimg.com/profile_images/1110578625...
4        http://pbs.twimg.com/profile_images/1352299062...
                               ...                        
24995    http://pbs.twimg.com/profile_images/9990132279...
24996    http://pbs.twimg.com/profile_images/1268034070...
24997    http://pbs.twimg.com/profile_images/1267871657...
24998    http://pbs.twimg.com/profile_images/1387224813...
24999    http://pbs.twimg.com/profile_images/9790454464...
Name: profile_image_url, Length: 25000, dtype: object

reply_to_status_id        22449
reply_to_user_id          22377
reply_to_screen_name      22377
quote_count               25000
reply_count               25000
hashtags                  25000
symbols                   25000
urls_url                  25000
urls_t.co                 25000
urls_expanded_url         25000
media_url                 25000
media_t.co                25000
media_expanded_url        25000
media_type                25000
ext_media_url             25000
ext_media_t.co            25000
ext_media_expanded_url    25000
ext_media_type            25000
mentions_user_id          25000
mentions_screen_name      25000
quoted_status_id          22407
quoted_text               22407
quoted_created_at         22407
quoted_source             22407
quoted_favorite_count     22407
quoted_retweet_count      22407
quoted_user_id            22407
quoted_screen_name        22407
quoted_name               22407
quoted_followers_count    22407
quoted_friends_count      22407
quoted_s